NewsArchival/NewsArchival.Scraper/Validation/Advertisement.cs

41 lines
1.2 KiB
C#

using System.Text;
using HtmlAgilityPack;
namespace NewsArchival.Scraper.Validation;
/// <summary>
/// Reduces an HTML document to the plain text of its paragraphs, links and headings.
/// Everything outside those elements is discarded.
/// </summary>
public class Advertisement
{
    // Selects every <p>, <a> and <h1>-<h6> element anywhere in the document.
    // Must stay in sync with IsContentTag below.
    private const string ContentXPath = "//p | //a | //h1 | //h2 | //h3 | //h4 | //h5 | //h6";

    /// <summary>
    /// Extracts the text of all paragraph, anchor and heading elements from
    /// <paramref name="htmlContent"/>, one block per element, separated by blank lines.
    /// </summary>
    /// <param name="htmlContent">Raw HTML markup to process.</param>
    /// <returns>
    /// The de-entitized, trimmed text of the outermost matching elements in document order.
    /// Returns the input unchanged when it is null, empty or whitespace, and an empty
    /// string when the markup contains no matching elements.
    /// </returns>
    /// <remarks>
    /// Any exception raised while parsing is written to the console and rethrown.
    /// </remarks>
    public static string Strip(string htmlContent)
    {
        try
        {
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return htmlContent;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var nodesToKeep = doc.DocumentNode.SelectNodes(ContentXPath);
            if (nodesToKeep is null)
            {
                return string.Empty;
            }

            var sb = new StringBuilder();
            foreach (var node in nodesToKeep)
            {
                // A match nested inside another match (e.g. <a> inside <p>) is already
                // part of the outer element's InnerText; emitting it again would
                // duplicate its text in the output.
                if (HasContentAncestor(node))
                {
                    continue;
                }

                string text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (!string.IsNullOrWhiteSpace(text))
                {
                    sb.AppendLine(text);
                    sb.AppendLine(); // Blank line between paragraphs/headers
                }
            }

            return sb.ToString().Trim();
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
            throw;
        }
    }

    // True when the element name is one of the tags selected by ContentXPath.
    private static bool IsContentTag(string name) =>
        name is "p" or "a" or "h1" or "h2" or "h3" or "h4" or "h5" or "h6";

    // True when any ancestor of the node is itself a selected content element.
    private static bool HasContentAncestor(HtmlNode node)
    {
        for (var parent = node.ParentNode; parent is not null; parent = parent.ParentNode)
        {
            if (IsContentTag(parent.Name))
            {
                return true;
            }
        }

        return false;
    }
}