NewsArchival/NewsArchival.Scraper/Worker.cs

81 lines
3.0 KiB
C#

using System.Net.Http.Json;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Interfaces;
namespace NewsArchival.Scraper;
/// <summary>
/// Background worker that repeatedly scrapes the hubs listed under the
/// <c>ScraperSettings:Hubs</c> configuration section and posts every article the
/// API does not already report as existing to <c>api/articles</c> using the
/// named <c>NewsApi</c> HTTP client.
/// </summary>
public class Worker(
    ILogger<Worker> logger,
    IConfiguration configuration,
    IScraperService scraperService,
    IRobotsService robotsService,
    IHttpClientFactory httpClient) : BackgroundService
{
    /// <summary>Site used for the crawl-delay lookup when a hub is not an absolute http(s) URL.</summary>
    private static readonly Uri DefaultRobotsUri = new("https://apnews.com");

    /// <summary>Minimum time between the start of two consecutive scrape cycles.</summary>
    private static readonly TimeSpan CycleInterval = TimeSpan.FromHours(1);

    /// <summary>
    /// Runs scrape cycles until <paramref name="stoppingToken"/> is cancelled. A failure in one
    /// hub is logged and does not prevent the remaining hubs from being processed.
    /// </summary>
    /// <param name="stoppingToken">Signals that the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var apiClient = httpClient.CreateClient("NewsApi");
        var hubs = configuration.GetSection("ScraperSettings:Hubs").Get<List<string>>() ?? [];

        logger.LogInformation("Scraper Worker started at: {time}", DateTimeOffset.Now);

        while (!stoppingToken.IsCancellationRequested)
        {
            var cycleStartTime = DateTime.UtcNow;

            foreach (var hub in hubs)
            {
                try
                {
                    await ProcessHubAsync(apiClient, hub, stoppingToken);
                }
                // Let shutdown cancellation propagate instead of reporting it as a hub failure.
                // Cancellations that are NOT caused by the stopping token (e.g. an HttpClient
                // timeout) are still logged here.
                catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
                {
                    logger.LogError(ex, "Critical error processing hub {Hub}", hub);
                }
            }

            await ApplyGlobalDelay(cycleStartTime, stoppingToken);
        }
    }

    /// <summary>
    /// Scrapes a single hub, archives each article the API does not already have, then waits
    /// for the crawl delay reported by the robots service before returning.
    /// </summary>
    private async Task ProcessHubAsync(HttpClient apiClient, string hub, CancellationToken ct)
    {
        logger.LogInformation("Processing hub: {Hub}", hub);

        var articles = await scraperService.GetArticles(hub);
        foreach (var article in articles)
        {
            if (await ArticleExistsAsync(apiClient, article.Id, ct))
            {
                continue;
            }

            // Dispose the response so the underlying connection/content is released promptly.
            using var response = await apiClient.PostAsJsonAsync("api/articles", article, ct);
            if (response.IsSuccessStatusCode)
            {
                logger.LogInformation("Successfully archived: {Title}", article.Title);
            }
            else
            {
                var error = await response.Content.ReadAsStringAsync(ct);
                logger.LogError("API Rejected {Id}: {Status} - {Error}", article.Id, response.StatusCode, error);
            }
        }

        var delayMs = await robotsService.GetCrawlDelayAsync(ResolveRobotsUri(hub));
        await Task.Delay(delayMs, ct);
    }

    /// <summary>
    /// Returns the scheme-and-authority root of <paramref name="hub"/> when it is an absolute
    /// http(s) URL; otherwise returns <see cref="DefaultRobotsUri"/>.
    /// </summary>
    private static Uri ResolveRobotsUri(string hub) =>
        Uri.TryCreate(hub, UriKind.Absolute, out var hubUri)
        && (hubUri.Scheme == Uri.UriSchemeHttp || hubUri.Scheme == Uri.UriSchemeHttps)
            ? new Uri(hubUri.GetLeftPart(UriPartial.Authority))
            : DefaultRobotsUri;

    /// <summary>
    /// Queries <c>api/check</c> for the given article id.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the API answers with a success status code; <c>false</c> for any other
    /// status.
    /// </returns>
    private async Task<bool> ArticleExistsAsync(HttpClient client, string id, CancellationToken ct)
    {
        // Escape the id so characters such as '&', '#', '+' or spaces cannot corrupt the query string.
        var escapedId = Uri.EscapeDataString(id ?? string.Empty);

        using var response = await client.GetAsync($"api/check?id={escapedId}", ct);
        return response.IsSuccessStatusCode;
    }

    /// <summary>
    /// Sleeps for whatever remains of <see cref="CycleInterval"/> measured from
    /// <paramref name="startTime"/>; returns immediately when the cycle already took longer.
    /// </summary>
    /// <param name="startTime">UTC time at which the current cycle started.</param>
    /// <param name="ct">Cancels the wait.</param>
    private async Task ApplyGlobalDelay(DateTime startTime, CancellationToken ct)
    {
        var elapsed = DateTime.UtcNow - startTime;
        var sleepTime = CycleInterval - elapsed;
        if (sleepTime > TimeSpan.Zero)
        {
            logger.LogInformation("Cycle complete. Sleeping for {Minutes} minutes...", sleepTime.TotalMinutes);
            await Task.Delay(sleepTime, ct);
        }
    }
}