using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Models;
using NewsArchival.Scraper.Validation;

namespace NewsArchival.Scraper.Services;
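
/// <summary>
/// Scrapes the newest articles from a single AP News hub page, respecting
/// robots.txt and skipping articles already recorded in the cache.
/// </summary>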
public class LatestService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : ScraperService(logger, cache, robots, httpClient, browsingContext)
{
    public override async Task<HashSet<Article>> GetArticles(string hub)
    {
        return await GetLatestFromHub(hub);
    }

    private const string Untitled = "Untitled";
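
    // Returns only the headlines that are not already in the hub's cached
    // "latest" set. This method reads that set but never writes it; the cache
    // entry is assumed to be maintained elsewhere.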
    private async Task<HashSet<Article>> GetLatestFromHub(string hub)
    {
        try
        {
            var cacheKey = $"hub:{hub.ToLowerInvariant()}:latest";
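
            // Kick off the cache read and the live scrape together so the two
            // I/O-bound operations run concurrently.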
            var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
            var headlinesTask = GetCurrentHeadlines(hub);

            var cachedData = await cacheTask ?? new HashSet<Article>();
            var currentHeadlines = await headlinesTask;

            return currentHeadlines
                .Where(x => cachedData.All(c => c.Url != x.Url))
                .ToHashSet();
        }
        catch (Exception e)
        {
            _logger.LogError(e, "Failed to get latest articles for hub {Hub}", hub);
            throw;
        }
    }
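
    // Loads the hub page (if robots.txt allows it), walks the headline list,
    // and scrapes each article not already processed within the last day.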
    private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
    {
        var articles = new HashSet<Article>();
        var hubUri = new Uri($"https://apnews.com/hub/{hub}");
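
        // Check robots.txt before fetching anything from the hub.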
        if (!await _robots.CanCrawlAsync(hubUri))
        {
            _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
            return articles;
        }

        _logger.LogInformation("Scraping hub {Hub}.", hubUri);
        var client = _httpClient.CreateClient("LexWellsPoliteClient");

        try
        {
            var html = await client.GetStringAsync(hubUri);
            var document = await _browsingContext.OpenAsync(req => req.Content(html));
            var elements = document.QuerySelectorAll(".PageList-items-item");
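
            // Guard against the same story appearing in more than one promo
            // slot on the page by tracking the IDs handled in this pass.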
            var seenIds = new HashSet<string>();
            foreach (var element in elements.Take(50))
            {
                var linkElement = element.QuerySelector(".PagePromo-title a");
                if (linkElement == null) continue;

                var articleUrl = linkElement.GetAttribute("href");
                if (string.IsNullOrEmpty(articleUrl)) continue;

                var articleId = GenerateId(articleUrl);

                if (!seenIds.Add(articleId)) continue;
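
                // Cross-run de-duplication: the processed flag written below
                // keeps an article from being scraped again for a day.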
var cacheKey = $"processed_article:{articleId}";
|
|
var isAlreadyProcessed = await _cache.GetAsync<bool>(cacheKey);
|
|
|
|
if (isAlreadyProcessed)
|
|
{
|
|
_logger.LogDebug("Skipping already processed article: {Id}", articleId);
|
|
continue;
|
|
}
|
|
|
|
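
                // Fetch and parse the full article; discard pages without a
                // usable headline.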
                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null && !string.IsNullOrWhiteSpace(article.Title) && article.Title != Untitled)
                {
                    articles.Add(article);

                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }
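
                // Honor the crawl delay from robots.txt between article requests.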
                var delay = await _robots.GetCrawlDelayAsync(hubUri);
                await Task.Delay(delay);
            }
        }
        catch (Exception e)
        {
            _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
        }

        return articles;
    }
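
    // Scrapes a single article page. Note that OpenAsync(url) has AngleSharp
    // issue its own request here, which bypasses the named "LexWellsPoliteClient"
    // and the robots/crawl-delay handling applied to the hub page.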
    private async Task<Article?> ScrapeFullArticle(string url, string hub)
    {
        var document = await _browsingContext.OpenAsync(url);

        var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
        if (storyContent == null) return null;

        return new Article
        {
            Id = GenerateId(url),
            Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
            Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
            Url = url,
            Category = hub,
            Content = Advertisement.Strip(storyContent.InnerHtml),
            IsWarZone = GetWarZoneStatus()
        };
    }
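
    // Placeholder: war-zone classification is not implemented yet, so every
    // article is marked false.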
    private bool GetWarZoneStatus()
    {
        return false;
    }
}