using System;
using System.Collections.Generic;
using System.Text;
using HtmlAgilityPack;

namespace NewsArchival.Scraper.Validation;

/// <summary>
/// Reduces an HTML fragment to the plain text of its paragraph, anchor, and heading elements.
/// </summary>
public class Advertisement
{
    /// <summary>
    /// Elements whose text is kept: paragraphs, anchors, and the headings h1 through h6.
    /// The headings are listed explicitly so that other two-character names starting
    /// with "h" (for example <c>hr</c>) are not selected.
    /// </summary>
    private const string KeptElementsXPath =
        "//p | //a | //h1 | //h2 | //h3 | //h4 | //h5 | //h6";

    /// <summary>
    /// Extracts the text of every paragraph, anchor, and heading element in
    /// <paramref name="htmlContent"/> and discards everything else.
    /// </summary>
    /// <param name="htmlContent">The HTML markup to reduce.</param>
    /// <returns>
    /// <paramref name="htmlContent"/> itself when it is null, empty, or whitespace;
    /// an empty string when no kept element is found; otherwise the entity-decoded,
    /// trimmed text of each outermost kept element, with one blank line between entries.
    /// </returns>
    /// <remarks>
    /// Any exception raised while parsing or extracting is written to the console and rethrown.
    /// </remarks>
    public static string Strip(string htmlContent)
    {
        try
        {
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return htmlContent;
            }

            var document = new HtmlDocument();
            document.LoadHtml(htmlContent);

            var keptNodes = document.DocumentNode.SelectNodes(KeptElementsXPath);
            if (keptNodes is null)
            {
                return string.Empty;
            }

            var keptSet = new HashSet<HtmlNode>(keptNodes);
            var builder = new StringBuilder();

            foreach (var node in keptNodes)
            {
                // InnerText of an outer kept element already contains the text of every
                // kept element nested inside it; emitting the inner one as well would
                // duplicate that text (e.g. a link inside a paragraph or heading).
                if (HasKeptAncestor(node, keptSet))
                {
                    continue;
                }

                string text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                builder.AppendLine(text);
                builder.AppendLine(); // Blank line between paragraphs/headers
            }

            return builder.ToString().Trim();
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
            throw;
        }
    }

    /// <summary>
    /// Reports whether any ancestor of <paramref name="node"/> is itself one of the kept elements.
    /// </summary>
    private static bool HasKeptAncestor(HtmlNode node, HashSet<HtmlNode> keptSet)
    {
        for (var parent = node.ParentNode; parent is not null; parent = parent.ParentNode)
        {
            if (keptSet.Contains(parent))
            {
                return true;
            }
        }

        return false;
    }
}