81 lines
3.0 KiB
C#
81 lines
3.0 KiB
C#
using System.Net.Http.Json;
|
|
using LexWells.Infrastructure.Common.Interfaces;
|
|
using NewsArchival.Core.Interfaces;
|
|
|
|
namespace NewsArchival.Scraper;
|
|
|
|
/// <summary>
/// Background worker that repeatedly walks the hubs listed under the
/// <c>ScraperSettings:Hubs</c> configuration section, asks the scraper service
/// for each hub's articles, and posts every article the News API does not
/// already report as existing.
/// </summary>
/// <param name="logger">Logger used for progress and error reporting.</param>
/// <param name="configuration">Configuration source providing the hub list.</param>
/// <param name="scraperService">Service that returns the articles for a hub.</param>
/// <param name="robotsService">Service that supplies the crawl delay to wait between hubs.</param>
/// <param name="httpClient">Factory used to create the named <c>NewsApi</c> client.</param>
public class Worker(
    ILogger<Worker> logger,
    IConfiguration configuration,
    IScraperService scraperService,
    IRobotsService robotsService,
    IHttpClientFactory httpClient) : BackgroundService
{
    // NOTE(review): the crawl delay is always looked up for this host, regardless of
    // which hub was just processed — confirm that every configured hub lives on it.
    private static readonly Uri CrawlDelayUri = new("https://apnews.com");

    // Target duration of one full pass over all hubs, including the trailing sleep.
    private static readonly TimeSpan CycleInterval = TimeSpan.FromHours(1);

    /// <summary>
    /// Runs scrape cycles until <paramref name="stoppingToken"/> is signalled.
    /// A failure in one hub is logged and does not prevent the remaining hubs
    /// from being processed; cancellation ends the worker without being
    /// reported as an error.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the worker should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var apiClient = httpClient.CreateClient("NewsApi");
        var hubs = configuration.GetSection("ScraperSettings:Hubs").Get<List<string>>() ?? [];

        logger.LogInformation("Scraper Worker started at: {time}", DateTimeOffset.Now);

        if (hubs.Count == 0)
        {
            logger.LogWarning("No hubs configured under ScraperSettings:Hubs; nothing will be scraped.");
        }

        while (!stoppingToken.IsCancellationRequested)
        {
            var cycleStartTime = DateTime.UtcNow;

            foreach (var hub in hubs)
            {
                // The scraper call itself takes no token, so check before starting another hub.
                if (stoppingToken.IsCancellationRequested)
                {
                    return;
                }

                try
                {
                    await ProcessHubAsync(apiClient, hub, stoppingToken);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown was requested: stop quietly instead of logging a "critical error"
                    // and moving on to the next hub.
                    return;
                }
                catch (Exception ex)
                {
                    logger.LogError(ex, "Critical error processing hub {Hub}", hub);
                }
            }

            await ApplyGlobalDelay(cycleStartTime, stoppingToken);
        }
    }

    /// <summary>
    /// Scrapes a single hub, archives its new articles, then waits for the
    /// crawl delay reported by the robots service.
    /// </summary>
    private async Task ProcessHubAsync(HttpClient apiClient, string hub, CancellationToken ct)
    {
        logger.LogInformation("Processing hub: {Hub}", hub);

        var articles = await scraperService.GetArticles(hub);

        foreach (var article in articles)
        {
            if (await ArticleExistsAsync(apiClient, article.Id, ct))
            {
                continue;
            }

            // Dispose the response so its content stream and connection are released promptly.
            using var response = await apiClient.PostAsJsonAsync("api/articles", article, ct);

            if (response.IsSuccessStatusCode)
            {
                logger.LogInformation("Successfully archived: {Title}", article.Title);
            }
            else
            {
                var error = await response.Content.ReadAsStringAsync(ct);
                logger.LogError("API Rejected {Id}: {Status} - {Error}", article.Id, response.StatusCode, error);
            }
        }

        var delayMs = await robotsService.GetCrawlDelayAsync(CrawlDelayUri);
        await Task.Delay(delayMs, ct);
    }

    /// <summary>
    /// Asks the API's <c>api/check</c> endpoint about an article id.
    /// </summary>
    /// <param name="client">Client used to call the API.</param>
    /// <param name="id">Article identifier; URL-escaped before being placed in the query string.</param>
    /// <param name="ct">Token used to cancel the request.</param>
    /// <returns>
    /// <c>true</c> when the endpoint answers with a success status code; <c>false</c> for any
    /// other status (including server errors, which are therefore treated as "not found").
    /// </returns>
    private async Task<bool> ArticleExistsAsync(HttpClient client, string id, CancellationToken ct)
    {
        // Escape the id so characters such as '&', '#', '+' or spaces cannot alter the query.
        using var response = await client.GetAsync($"api/check?id={Uri.EscapeDataString(id)}", ct);
        return response.IsSuccessStatusCode;
    }

    /// <summary>
    /// Sleeps for whatever remains of the cycle interval, measured from
    /// <paramref name="startTime"/>. Returns immediately when the cycle already
    /// took at least that long.
    /// </summary>
    /// <param name="startTime">UTC time at which the current cycle began.</param>
    /// <param name="ct">Token used to cut the sleep short on shutdown.</param>
    private async Task ApplyGlobalDelay(DateTime startTime, CancellationToken ct)
    {
        var elapsed = DateTime.UtcNow - startTime;
        var sleepTime = CycleInterval - elapsed;

        if (sleepTime > TimeSpan.Zero)
        {
            logger.LogInformation("Cycle complete. Sleeping for {Minutes} minutes...", sleepTime.TotalMinutes);
            await Task.Delay(sleepTime, ct);
        }
    }
}