43 lines
1.4 KiB
C#
43 lines
1.4 KiB
C#
using System.Security.Cryptography;
|
|
using System.Text;
|
|
using AngleSharp;
|
|
using LexWells.Infrastructure.Common.Interfaces;
|
|
using NewsArchival.Core.Interfaces;
|
|
using NewsArchival.Core.Models;
|
|
|
|
namespace NewsArchival.Scraper.Services;
|
|
|
|
/// <summary>
/// Abstract base for scraper implementations. Stores the injected services in
/// protected fields so derived classes can use them, and provides a shared
/// helper for deriving a stable identifier from an article URL.
/// </summary>
/// <param name="logger">Logger stored in <see cref="_logger"/>.</param>
/// <param name="cache">Cache service stored in <see cref="_cache"/>.</param>
/// <param name="robots">Robots service stored in <see cref="_robots"/>.</param>
/// <param name="httpClient">HTTP client factory stored in <see cref="_httpClient"/>.</param>
/// <param name="browsingContext">AngleSharp browsing context stored in <see cref="_browsingContext"/>.</param>
public abstract class ScraperService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : IScraperService
{
    /// <summary>
    /// Base address used to resolve relative URLs in <see cref="GenerateId"/>.
    /// Created once instead of on every call.
    /// </summary>
    private static readonly Uri DefaultBaseUri = new("https://apnews.com");

    /// <summary>Injected logger, available to derived classes.</summary>
    protected readonly ILogger<Worker> _logger = logger;

    /// <summary>Injected cache service, available to derived classes.</summary>
    protected readonly ICacheService _cache = cache;

    /// <summary>Injected robots service, available to derived classes.</summary>
    protected readonly IRobotsService _robots = robots;

    /// <summary>Injected HTTP client factory, available to derived classes.</summary>
    protected readonly IHttpClientFactory _httpClient = httpClient;

    /// <summary>Injected browsing context, available to derived classes.</summary>
    protected readonly IBrowsingContext _browsingContext = browsingContext;

    #region IScraperService

    /// <summary>
    /// Retrieves the set of articles for the given hub. The concrete behavior
    /// is supplied by each derived scraper.
    /// </summary>
    /// <param name="hub">
    /// Hub to scrape. NOTE(review): whether this is a URL, a path, or a name is
    /// decided by the implementation — confirm against callers.
    /// </param>
    /// <returns>A task producing the articles found.</returns>
    public abstract Task<HashSet<Article>> GetArticles(string hub);

    #endregion

    /// <summary>
    /// Builds a deterministic identifier for an article URL: the lowercase
    /// hexadecimal SHA-256 digest of the normalized URL.
    /// </summary>
    /// <remarks>
    /// Normalization resolves <paramref name="url"/> against
    /// <c>https://apnews.com</c> (absolute URLs keep their own scheme and host),
    /// keeps only scheme, host and path, strips trailing slashes from the path,
    /// and lower-cases the result. Query string, fragment and port are therefore
    /// not part of the identifier.
    /// </remarks>
    /// <param name="url">Absolute or relative article URL.</param>
    /// <returns>
    /// The literal <c>"unknown"</c> when <paramref name="url"/> is null, empty or
    /// whitespace; otherwise a 64-character lowercase hex string.
    /// </returns>
    /// <exception cref="UriFormatException">
    /// May be thrown by the <see cref="Uri"/> constructor when
    /// <paramref name="url"/> cannot be combined with the base address.
    /// </exception>
    protected static string GenerateId(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return "unknown";
        }

        var uri = new Uri(DefaultBaseUri, url);

        var cleanPath = uri.AbsolutePath.TrimEnd('/');
        var normalizedUrl = $"{uri.Scheme}://{uri.Host}{cleanPath}".ToLowerInvariant();

        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl));

        // Invariant casing: the digest is machine-readable and must not depend
        // on the current thread culture.
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}