NewsArchival/NewsArchival.Scraper/Services/ScraperService.cs

43 lines
1.4 KiB
C#

using System.Security.Cryptography;
using System.Text;
using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Interfaces;
using NewsArchival.Core.Models;
namespace NewsArchival.Scraper.Services;
/// <summary>
/// Abstract base for scrapers. Stores the injected collaborators in protected
/// fields for use by derived classes and provides <see cref="GenerateId"/> for
/// deriving a deterministic id from a URL.
/// </summary>
/// <param name="logger">Logger, exposed to derived classes as <c>_logger</c>.</param>
/// <param name="cache">Cache service, exposed to derived classes as <c>_cache</c>.</param>
/// <param name="robots">Robots service, exposed to derived classes as <c>_robots</c>.</param>
/// <param name="httpClient">HTTP client factory, exposed to derived classes as <c>_httpClient</c>.</param>
/// <param name="browsingContext">AngleSharp browsing context, exposed to derived classes as <c>_browsingContext</c>.</param>
public abstract class ScraperService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : IScraperService
{
    // Base address that relative URLs are resolved against in GenerateId.
    // Built once instead of on every call; Uri instances are immutable.
    private static readonly Uri BaseUri = new("https://apnews.com");

    protected readonly ILogger<Worker> _logger = logger;
    protected readonly ICacheService _cache = cache;
    protected readonly IRobotsService _robots = robots;
    protected readonly IHttpClientFactory _httpClient = httpClient;
    protected readonly IBrowsingContext _browsingContext = browsingContext;

    #region IScraperService

    /// <summary>
    /// Produces the set of articles for <paramref name="hub"/>. Implemented by derived classes.
    /// </summary>
    /// <param name="hub">
    /// NOTE(review): presumably identifies the listing page/section to scrape —
    /// confirm against the concrete implementations.
    /// </param>
    /// <returns>A task yielding the articles as a <see cref="HashSet{T}"/>.</returns>
    public abstract Task<HashSet<Article>> GetArticles(string hub);

    #endregion

    /// <summary>
    /// Derives a deterministic id from a URL: the lower-case hexadecimal SHA-256
    /// digest of a normalized form of the URL.
    /// </summary>
    /// <remarks>
    /// Normalization keeps only the scheme, host and path. The query string,
    /// fragment, port and user info do not take part in the id, a trailing
    /// <c>/</c> on the path is removed, and the whole string is lower-cased with
    /// the invariant culture. Relative URLs are resolved against
    /// <c>https://apnews.com</c>; absolute URLs keep their own scheme and host.
    /// </remarks>
    /// <param name="url">Absolute or relative URL to derive the id from.</param>
    /// <returns>
    /// A 64-character lower-case hex string, or the literal <c>"unknown"</c> when
    /// <paramref name="url"/> is <see langword="null"/>, empty or whitespace.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// <paramref name="url"/> cannot be combined with the base address into a valid URI.
    /// </exception>
    protected static string GenerateId(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return "unknown";
        }

        var uri = new Uri(BaseUri, url);
        var cleanPath = uri.AbsolutePath.TrimEnd('/');
        var normalizedUrl = $"{uri.Scheme}://{uri.Host}{cleanPath}".ToLowerInvariant();

        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl));

        // Invariant casing so the id never depends on the current culture,
        // matching the normalization above.
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}