41 lines
1.2 KiB
C#
41 lines
1.2 KiB
C#
using System.Text;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace NewsArchival.Scraper.Validation;
|
|
|
|
/// <summary>
/// Reduces an HTML document to the plain text of its paragraphs, links, and headings,
/// discarding every other element.
/// </summary>
public class Advertisement
{
    // Selects <p>, <a>, and every element whose name is 'h' followed by exactly one more
    // character. That covers h1-h6; it also matches <hr>, which normally carries no text
    // and is therefore dropped by the empty-text check in Strip.
    private const string KeptNodesXPath =
        "//p | //a | //*[starts-with(name(), 'h') and string-length(name()) = 2]";

    /// <summary>
    /// Extracts the text of all paragraph, anchor, and heading elements from
    /// <paramref name="htmlContent"/>, one entry per element separated by a blank line.
    /// </summary>
    /// <param name="htmlContent">Raw HTML markup to process.</param>
    /// <returns>
    /// The input itself when it is null, empty, or whitespace; an empty string when no
    /// paragraph, anchor, or heading element is found; otherwise the trimmed, entity-decoded
    /// text of the selected elements.
    /// </returns>
    /// <remarks>
    /// Any exception raised while parsing is written to the console and then rethrown.
    /// </remarks>
    public static string Strip(string htmlContent)
    {
        try
        {
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return htmlContent;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            var nodesToKeep = doc.DocumentNode.SelectNodes(KeptNodesXPath);
            if (nodesToKeep is null)
            {
                return string.Empty;
            }

            var sb = new StringBuilder();
            foreach (var node in nodesToKeep)
            {
                // InnerText is recursive, so a link or heading nested inside another
                // selected element is already part of that element's text. Emitting it
                // again would duplicate the content in the output.
                if (HasKeptAncestor(node))
                {
                    continue;
                }

                string text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (!string.IsNullOrWhiteSpace(text))
                {
                    sb.AppendLine(text);
                    sb.AppendLine(); // Blank line between paragraphs/headers for readability.
                }
            }

            return sb.ToString().Trim();
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
            throw;
        }
    }

    // Returns true when any ancestor of the node is itself an element selected by KeptNodesXPath.
    private static bool HasKeptAncestor(HtmlNode node)
    {
        for (var parent = node.ParentNode; parent is not null; parent = parent.ParentNode)
        {
            if (IsKeptElement(parent))
            {
                return true;
            }
        }

        return false;
    }

    // Mirrors the KeptNodesXPath predicate for a single node: <p>, <a>, or a two-character name starting with 'h'.
    private static bool IsKeptElement(HtmlNode node)
    {
        if (node.NodeType != HtmlNodeType.Element)
        {
            return false;
        }

        string name = node.Name;
        return name == "p" || name == "a" || (name.Length == 2 && name[0] == 'h');
    }
}