using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Models;
using NewsArchival.Scraper.Validation;

namespace NewsArchival.Scraper.Services;

/// <summary>
/// Scrapes the latest headlines from an AP News hub page and returns only the
/// articles that were not present in the previously cached snapshot for that hub.
/// Per-article "already processed" markers are kept in the cache for one day.
/// </summary>
public class LatestService(
    ILogger logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : ScraperService(logger, cache, robots, httpClient, browsingContext)
{
    // Sentinel title produced when a page has no <h1 class="Page-headline">;
    // such articles are discarded in GetCurrentHeadlines.
    private const string Untitled = "Untitled";

    /// <inheritdoc />
    public override async Task<HashSet<Article>> GetArticles(string hub)
    {
        return await GetLatestFromHub(hub);
    }

    /// <summary>
    /// Returns the articles currently listed on the hub page that do not
    /// appear (by URL) in the cached snapshot from a previous run.
    /// </summary>
    /// <param name="hub">AP News hub slug, e.g. "politics".</param>
    /// <exception cref="Exception">Rethrows any failure after logging it.</exception>
    private async Task<HashSet<Article>> GetLatestFromHub(string hub)
    {
        try
        {
            // Invariant lower-casing keeps the cache key stable regardless of
            // the current thread culture (was culture-sensitive ToLower()).
            var cacheKey = $"hub:{hub.ToLowerInvariant()}:latest";

            // Start the cache read and the live scrape concurrently.
            var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
            var headlinesTask = GetCurrentHeadlines(hub);

            var cachedData = await cacheTask ?? new HashSet<Article>();
            var currentHeadlines = await headlinesTask;

            // O(1) membership test per headline instead of scanning the whole
            // cached set for each one (was cachedData.All(...) inside Where).
            var cachedUrls = cachedData.Select(c => c.Url).ToHashSet();
            return currentHeadlines
                .Where(x => !cachedUrls.Contains(x.Url))
                .ToHashSet();
        }
        catch (Exception e)
        {
            // Log through the injected logger (was Console.WriteLine) and
            // rethrow bare to preserve the original stack trace.
            _logger.LogError(e, "Failed to get latest articles for hub {Hub}", hub);
            throw;
        }
    }

    /// <summary>
    /// Fetches the hub listing page (robots.txt permitting), scrapes up to the
    /// first 50 listed articles that have not already been processed, and
    /// returns those with a real (non-empty, non-"Untitled") title.
    /// Failures are logged and a partial (possibly empty) set is returned.
    /// </summary>
    private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
    {
        var articles = new HashSet<Article>();
        var hubUri = new Uri($"https://apnews.com/hub/{hub}");

        if (!await _robots.CanCrawlAsync(hubUri))
        {
            _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
            return articles;
        }

        _logger.LogInformation("Scraping starting.");
        var client = _httpClient.CreateClient("LexWellsPoliteClient");

        try
        {
            var html = await client.GetStringAsync(hubUri);
            var document = await _browsingContext.OpenAsync(req => req.Content(html));
            var elements = document.QuerySelectorAll(".PageList-items-item");

            // Tracks generated article IDs (renamed from the misleading
            // "seenUrls") to skip duplicate listings on the same page.
            var seenIds = new HashSet<string>();
            foreach (var element in elements.Take(50))
            {
                var linkElement = element.QuerySelector(".PagePromo-title a");
                if (linkElement == null) continue;

                var articleUrl = linkElement.GetAttribute("href");
                if (string.IsNullOrEmpty(articleUrl)) continue;

                var articleId = GenerateId(articleUrl);
                if (!seenIds.Add(articleId)) continue;

                var cacheKey = $"processed_article:{articleId}";
                var isAlreadyProcessed = await _cache.GetAsync<bool>(cacheKey);
                if (isAlreadyProcessed)
                {
                    _logger.LogDebug("Skipping already processed article: {Id}", articleId);
                    continue;
                }

                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null
                    && !string.IsNullOrWhiteSpace(article.Title)
                    && article.Title != Untitled)
                {
                    articles.Add(article);
                    // Mark processed for a day so repeat runs skip this article.
                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }

                // Politeness delay between fetches, as advertised by robots.txt.
                var delay = await _robots.GetCrawlDelayAsync(hubUri);
                await Task.Delay(delay);
            }
        }
        catch (Exception e)
        {
            // Best-effort: log and return whatever was scraped so far.
            _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
        }

        return articles;
    }

    /// <summary>
    /// Loads a single article page and builds an <see cref="Article"/> from it.
    /// Returns null when the page has no recognizable story body.
    /// </summary>
    /// <param name="url">Absolute URL of the article page.</param>
    /// <param name="hub">Hub slug recorded as the article's category.</param>
    private async Task<Article?> ScrapeFullArticle(string url, string hub)
    {
        var document = await _browsingContext.OpenAsync(url);
        var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
        if (storyContent == null) return null;

        return new Article
        {
            Id = GenerateId(url),
            Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
            Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
            Url = url,
            Category = hub,
            Content = Advertisement.Strip(storyContent.InnerHtml),
            IsWarZone = GetWarZoneStatus()
        };
    }

    // NOTE(review): always returns false — war-zone detection appears to be a
    // stub; confirm whether an implementation is planned.
    private bool GetWarZoneStatus()
    {
        return false;
    }
}