// NewsArchival/NewsArchival.Scraper/Services/LatestService.cs
using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Models;
using NewsArchival.Scraper.Validation;
namespace NewsArchival.Scraper.Services;
/// <summary>
/// Scrapes the "latest" listing of an AP News hub page and returns the articles
/// that have not been seen before, honoring robots.txt and the advertised crawl delay.
/// </summary>
public class LatestService(
ILogger<Worker> logger,
ICacheService cache,
IRobotsService robots,
IHttpClientFactory httpClient,
IBrowsingContext browsingContext)
: ScraperService(logger, cache, robots, httpClient, browsingContext)
{
// Sentinel title used when an article page has no parsable headline.
private const string Untitled = "Untitled";

/// <summary>
/// Returns the articles currently listed on the hub that are not already
/// present in the hub's cached "latest" set.
/// </summary>
/// <param name="hub">AP News hub slug, e.g. "world-news".</param>
public override async Task<HashSet<Article>> GetArticles(string hub)
{
    return await GetLatestFromHub(hub);
}

/// <summary>
/// Diffs the live hub headlines against the cached set for this hub and
/// returns only articles whose URL has not been seen before.
/// </summary>
/// <exception cref="Exception">Rethrows any failure after logging it.</exception>
private async Task<HashSet<Article>> GetLatestFromHub(string hub)
{
    try
    {
        // Invariant lowercasing keeps cache keys stable regardless of the
        // current thread culture (e.g. the Turkish 'I' problem with ToLower()).
        var cacheKey = $"hub:{hub.ToLowerInvariant()}:latest";

        // Kick off the cache read and the live scrape concurrently.
        var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
        var headlinesTask = GetCurrentHeadlines(hub);

        var cachedData = await cacheTask ?? new HashSet<Article>();
        var currentHeadlines = await headlinesTask;

        // O(1) membership lookups instead of scanning the cached set per headline.
        var cachedUrls = cachedData.Select(c => c.Url).ToHashSet(StringComparer.Ordinal);

        return currentHeadlines
            .Where(x => !cachedUrls.Contains(x.Url))
            .ToHashSet();
    }
    catch (Exception e)
    {
        // Use the injected logger (was Console.WriteLine) and preserve the
        // original stack trace with a bare rethrow.
        _logger.LogError(e, "Failed to get latest articles for hub {Hub}", hub);
        throw;
    }
}

/// <summary>
/// Fetches the hub listing page, scrapes each promoted article that has not
/// been processed within the last day, and returns the successfully scraped set.
/// Respects robots.txt and waits the advertised crawl delay between articles.
/// </summary>
private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
{
    var articles = new HashSet<Article>();
    var hubUri = new Uri($"https://apnews.com/hub/{hub}");

    if (!await _robots.CanCrawlAsync(hubUri))
    {
        _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
        return articles;
    }

    _logger.LogInformation("Scraping starting.");
    var client = _httpClient.CreateClient("LexWellsPoliteClient");
    try
    {
        var html = await client.GetStringAsync(hubUri);
        var document = await _browsingContext.OpenAsync(req => req.Content(html));
        var elements = document.QuerySelectorAll(".PageList-items-item");

        // The crawl delay is per-host and does not change between iterations;
        // fetch it once instead of awaiting it inside the loop.
        var crawlDelay = await _robots.GetCrawlDelayAsync(hubUri);

        var seenIds = new HashSet<string>(); // was "seenUrls" — it holds generated IDs
        foreach (var element in elements.Take(50)) // cap keeps a single run bounded
        {
            var linkElement = element.QuerySelector(".PagePromo-title a");
            var articleUrl = linkElement?.GetAttribute("href");
            if (string.IsNullOrEmpty(articleUrl)) continue;

            var articleId = GenerateId(articleUrl);
            // Listing pages can repeat the same promo; dedupe within this run.
            if (!seenIds.Add(articleId)) continue;

            var cacheKey = $"processed_article:{articleId}";
            if (await _cache.GetAsync<bool>(cacheKey))
            {
                _logger.LogDebug("Skipping already processed article: {Id}", articleId);
                continue;
            }

            try
            {
                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null
                    && !string.IsNullOrWhiteSpace(article.Title)
                    && article.Title != Untitled)
                {
                    articles.Add(article);
                    // Mark processed only on success so a transient failure is retried.
                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }
            }
            catch (Exception e)
            {
                // One broken article page must not abort the whole hub scrape.
                _logger.LogError(e, "Failed to scrape article {Url}", articleUrl);
            }

            await Task.Delay(crawlDelay);
        }
    }
    catch (Exception e)
    {
        _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
    }
    return articles;
}

/// <summary>
/// Loads a single article page and extracts its metadata and ad-stripped body.
/// Returns null when the page has no recognizable story body.
/// </summary>
/// <param name="url">Absolute URL of the article page.</param>
/// <param name="hub">Hub slug, recorded as the article's category.</param>
// NOTE(review): this fetches via the browsing context directly, bypassing the
// polite named HttpClient used for the hub page — confirm that is intentional.
private async Task<Article?> ScrapeFullArticle(string url, string hub)
{
    var document = await _browsingContext.OpenAsync(url);
    var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
    if (storyContent == null) return null;

    return new Article
    {
        Id = GenerateId(url),
        Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
        Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
        Url = url,
        Category = hub,
        Content = Advertisement.Strip(storyContent.InnerHtml),
        IsWarZone = GetWarZoneStatus()
    };
}

/// <summary>
/// Placeholder for war-zone classification; always false for now.
/// </summary>
private bool GetWarZoneStatus()
{
    return false;
}
}