// NewsArchival/NewsArchival.Scraper/Services/LatestService.cs
using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Models;
using NewsArchival.Scraper.Validation;
namespace NewsArchival.Scraper.Services;
/// <summary>
/// Scrapes the "latest" listing of an AP News hub page and returns the articles
/// that have not been seen before, honoring robots.txt and the advertised crawl delay.
/// </summary>
public class LatestService(
ILogger<Worker> logger,
ICacheService cache,
IRobotsService robots,
IHttpClientFactory httpClient,
IBrowsingContext browsingContext)
: ScraperService(logger, cache, robots, httpClient, browsingContext)
{
// Sentinel title used when an article page has no parsable headline.
private const string Untitled = "Untitled";

/// <summary>
/// Returns the articles currently listed on the hub that are not already
/// present in the hub's cached "latest" set.
/// </summary>
/// <param name="hub">AP News hub slug, e.g. "world-news".</param>
public override async Task<HashSet<Article>> GetArticles(string hub)
{
    return await GetLatestFromHub(hub);
}

/// <summary>
/// Diffs the live hub headlines against the cached set for this hub and
/// returns only articles whose URL has not been seen before.
/// </summary>
/// <exception cref="Exception">Rethrows any failure after logging it.</exception>
private async Task<HashSet<Article>> GetLatestFromHub(string hub)
{
    try
    {
        // Invariant lowercasing keeps cache keys stable regardless of the
        // current thread culture (e.g. the Turkish 'I' problem with ToLower()).
        var cacheKey = $"hub:{hub.ToLowerInvariant()}:latest";

        // Kick off the cache read and the live scrape concurrently.
        var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
        var headlinesTask = GetCurrentHeadlines(hub);

        var cachedData = await cacheTask ?? new HashSet<Article>();
        var currentHeadlines = await headlinesTask;

        // O(1) membership lookups instead of scanning the cached set per headline.
        var cachedUrls = cachedData.Select(c => c.Url).ToHashSet(StringComparer.Ordinal);

        return currentHeadlines
            .Where(x => !cachedUrls.Contains(x.Url))
            .ToHashSet();
    }
    catch (Exception e)
    {
        // Use the injected logger (was Console.WriteLine) and preserve the
        // original stack trace with a bare rethrow.
        _logger.LogError(e, "Failed to get latest articles for hub {Hub}", hub);
        throw;
    }
}

/// <summary>
/// Fetches the hub listing page, scrapes each promoted article that has not
/// been processed within the last day, and returns the successfully scraped set.
/// Respects robots.txt and waits the advertised crawl delay between articles.
/// </summary>
private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
{
    var articles = new HashSet<Article>();
    var hubUri = new Uri($"https://apnews.com/hub/{hub}");

    if (!await _robots.CanCrawlAsync(hubUri))
    {
        _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
        return articles;
    }

    _logger.LogInformation("Scraping starting.");
    var client = _httpClient.CreateClient("LexWellsPoliteClient");
    try
    {
        var html = await client.GetStringAsync(hubUri);
        var document = await _browsingContext.OpenAsync(req => req.Content(html));
        var elements = document.QuerySelectorAll(".PageList-items-item");

        // The crawl delay is per-host and does not change between iterations;
        // fetch it once instead of awaiting it inside the loop.
        var crawlDelay = await _robots.GetCrawlDelayAsync(hubUri);

        var seenIds = new HashSet<string>(); // was "seenUrls" — it holds generated IDs
        foreach (var element in elements.Take(50)) // cap keeps a single run bounded
        {
            var linkElement = element.QuerySelector(".PagePromo-title a");
            var articleUrl = linkElement?.GetAttribute("href");
            if (string.IsNullOrEmpty(articleUrl)) continue;

            var articleId = GenerateId(articleUrl);
            // Listing pages can repeat the same promo; dedupe within this run.
            if (!seenIds.Add(articleId)) continue;

            var cacheKey = $"processed_article:{articleId}";
            if (await _cache.GetAsync<bool>(cacheKey))
            {
                _logger.LogDebug("Skipping already processed article: {Id}", articleId);
                continue;
            }

            try
            {
                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null
                    && !string.IsNullOrWhiteSpace(article.Title)
                    && article.Title != Untitled)
                {
                    articles.Add(article);
                    // Mark processed only on success so a transient failure is retried.
                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }
            }
            catch (Exception e)
            {
                // One broken article page must not abort the whole hub scrape.
                _logger.LogError(e, "Failed to scrape article {Url}", articleUrl);
            }

            await Task.Delay(crawlDelay);
        }
    }
    catch (Exception e)
    {
        _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
    }
    return articles;
}

/// <summary>
/// Loads a single article page and extracts its metadata and ad-stripped body.
/// Returns null when the page has no recognizable story body.
/// </summary>
/// <param name="url">Absolute URL of the article page.</param>
/// <param name="hub">Hub slug, recorded as the article's category.</param>
// NOTE(review): this fetches via the browsing context directly, bypassing the
// polite named HttpClient used for the hub page — confirm that is intentional.
private async Task<Article?> ScrapeFullArticle(string url, string hub)
{
    var document = await _browsingContext.OpenAsync(url);
    var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
    if (storyContent == null) return null;

    return new Article
    {
        Id = GenerateId(url),
        Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
        Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
        Url = url,
        Category = hub,
        Content = Advertisement.Strip(storyContent.InnerHtml),
        IsWarZone = GetWarZoneStatus()
    };
}

/// <summary>
/// Placeholder for war-zone classification; always false for now.
/// </summary>
private bool GetWarZoneStatus()
{
    return false;
}
}