Initial commit
This commit is contained in:
commit
426cba1dd3
|
|
@ -0,0 +1,27 @@
|
|||
**/.dockerignore
|
||||
**/.env
|
||||
**/.git
|
||||
**/.gitignore
|
||||
**/.project
|
||||
**/.settings
|
||||
**/.toolstarget
|
||||
**/.vs
|
||||
**/.vscode
|
||||
**/.idea
|
||||
**/*.*proj.user
|
||||
**/*.db
|
||||
**/*.dbmdl
|
||||
**/*.jfm
|
||||
**/azds.yaml
|
||||
**/bin
|
||||
**/charts
|
||||
**/docker-compose*
|
||||
**/Dockerfile*
|
||||
**/node_modules
|
||||
**/npm-debug.log
|
||||
**/obj
|
||||
**/secrets.dev.yaml
|
||||
**/values.dev.yaml
|
||||
**/*.user
|
||||
LICENSE
|
||||
README.md
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# IDEs
|
||||
.idea/
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
# Build outputs
|
||||
bin/
|
||||
obj/
|
||||
dist/
|
||||
publish/
|
||||
|
||||
# Archives and Packages
|
||||
*.tar.gz
|
||||
*.zip
|
||||
*.nupkg
|
||||
|
||||
# Databases
|
||||
*.db
|
||||
*.db-shm
|
||||
*.db-wal
|
||||
|
||||
# User-specific settings
|
||||
*.sln.DotSettings.user
|
||||
appsettings*.json
**/Properties/launchSettings.json
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
using Microsoft.AspNetCore.Mvc;
|
||||
using LexWells.Infrastructure.Common.Interfaces;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using NewsArchival.Api.Data;
|
||||
using NewsArchival.Core.Models;
|
||||
|
||||
namespace NewsArchival.Api.Controllers;
|
||||
|
||||
[ApiController]
[Route("api/[controller]")]
public class ArticlesController(AppDbContext db, ICacheService cache) : ControllerBase
{
    /// <summary>
    /// Archives a new article and invalidates the cached listing for its category.
    /// </summary>
    /// <param name="article">The article to persist; validated by model binding.</param>
    /// <returns>201 Created with the stored article, or 400 on validation failure.</returns>
    [HttpPost]
    public async Task<IActionResult> PostArticle([FromBody] Article article)
    {
        if (!ModelState.IsValid) return BadRequest(ModelState);

        db.Articles.Add(article);
        await db.SaveChangesAsync();

        // Drop the cached list for this category so the next GET sees the new article.
        await cache.RemoveAsync($"{article.Category.ToLower()}:all_articles");

        return CreatedAtAction(
            nameof(GetArticles),
            new { hub = article.Category.ToLower() },
            article);
    }

    /// <summary>
    /// Returns all archived articles for a hub (category), newest first.
    /// Results are cached for one hour under a case-normalized key.
    /// </summary>
    /// <param name="hub">Hub/category name; matched case-insensitively.</param>
    [HttpGet("{hub}")]
    public async Task<IActionResult> GetArticles(string hub)
    {
        // Normalize once: the cache key and the DB comparison must agree on casing.
        // Previously the key was lowercased but the query compared Category == hub
        // case-sensitively, so "Politics" and "politics" shared one cache entry
        // while matching different rows.
        var normalizedHub = hub.ToLower();
        var cacheKey = $"{normalizedHub}:all_articles";

        var articles = await cache.GetAsync<List<Article>>(cacheKey);

        if (articles == null)
        {
            // Case-insensitive match (EF Core translates ToLower() to LOWER())
            // so the query agrees with the normalized cache key above.
            articles = await db.Articles
                .Where(x => x.Category.ToLower() == normalizedHub)
                .OrderByDescending(a => a.CreatedAt)
                .ToListAsync();

            await cache.SetAsync(cacheKey, articles, TimeSpan.FromHours(1));
        }

        return Ok(articles);
    }
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
using LexWells.Infrastructure.EntityFramework;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using NewsArchival.Core.Models;
|
||||
|
||||
namespace NewsArchival.Api.Data;
|
||||
|
||||
public class AppDbContext(DbContextOptions<AppDbContext> options) : LexWellsDbContext(options)
{
    /// <summary>Archived news articles.</summary>
    public DbSet<Article> Articles => Set<Article>();

    /// <summary>
    /// Applies the base model configuration, then adds a unique index on the
    /// article URL so the same story is never archived twice.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        base.OnModelCreating(modelBuilder);

        var article = modelBuilder.Entity<Article>();

        // A duplicate URL means a duplicate article; let the database enforce it.
        article
            .HasIndex(a => a.Url)
            .IsUnique();
    }
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build
WORKDIR /src

# Copy solution, NuGet config, and project files first so `dotnet restore`
# is cached while dependencies are unchanged.
# NOTE(review): paths renamed NewsArchive.* -> NewsArchival.* to match the
# actual project names in the solution; the NewsArchive.UI copy was dropped
# because no UI project exists in the solution file — confirm.
COPY NewsArchival.sln ./
COPY nuget.config ./
COPY LocalNuGet/ ./LocalNuGet/
COPY NewsArchival.Api/*.csproj ./NewsArchival.Api/
COPY NewsArchival.Scraper/*.csproj ./NewsArchival.Scraper/
COPY NewsArchival.Core/*.csproj ./NewsArchival.Core/

RUN dotnet restore

COPY . .

WORKDIR "/src/NewsArchival.Api"
RUN dotnet publish "NewsArchival.Api.csproj" -c Release -o /app/publish --no-restore

FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final
WORKDIR /app
# Writable data directory for the SQLite database when running as non-root (uid 1000).
RUN mkdir -p /app/data && chown -R 1000:1000 /app/data
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "NewsArchival.Api.dll"]
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk.Web">

  <PropertyGroup>
    <TargetFramework>net9.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <InvariantGlobalization>true</InvariantGlobalization>
    <!-- NOTE(review): <PublishAot>true</PublishAot> removed. Native AOT is not
         supported with MVC controllers or EF Core's runtime model building
         (both used by this project), and the alpine SDK image used in the
         Dockerfile lacks the native toolchain, so publish would fail. -->
    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
  </PropertyGroup>

  <ItemGroup>
    <Content Include="..\.dockerignore">
      <Link>.dockerignore</Link>
    </Content>
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
    <PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
    <PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.14" />
    <PackageReference Include="Microsoft.EntityFrameworkCore" Version="9.0.14" />
    <PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="9.0.14">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
    </PackageReference>
    <PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="9.0.14" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\NewsArchival.Core\NewsArchival.Core.csproj" />
  </ItemGroup>

</Project>
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
@NewsArchival.Api_HostAddress = http://localhost:5297

# List archived articles for a hub (ArticlesController: GET api/articles/{hub})
GET {{NewsArchival.Api_HostAddress}}/api/articles/politics
Accept: application/json

###

# Existence probe used by the scraper (minimal API: GET /api/check?id=...)
GET {{NewsArchival.Api_HostAddress}}/api/check?id=example-id
Accept: application/json

###
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
using LexWells.Infrastructure.Common;
|
||||
using LexWells.Infrastructure.EntityFramework;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using NewsArchival.Api.Data;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);

// 1. Infrastructure First (Redis + HttpClient + Robots)
builder.Services.AddLexWellsInfrastructure(
    builder.Configuration.GetConnectionString("Redis") ?? "localhost:6379");

// 2. Database Support (SQLite file fallback when no connection string is configured)
builder.Services.AddLexWellsDatabase<AppDbContext>(
    builder.Configuration.GetConnectionString("DefaultConnection") ?? "Data Source=news.db");

builder.Services.AddControllers();
builder.Services.AddOpenApi();

var app = builder.Build();

// Apply pending EF Core migrations on startup so the database exists and is
// up to date before the first request.
// NOTE(review): runs on every boot; with multiple instances sharing one
// database, concurrent migration attempts could race — confirm single-instance.
using (var scope = app.Services.CreateScope())
{
    var context = scope.ServiceProvider.GetRequiredService<AppDbContext>();

    context.Database.Migrate();
}

if (app.Environment.IsDevelopment())
{
    app.MapOpenApi();
}

app.UseHttpsRedirection();

app.MapControllers();

// Lightweight existence probe used by the scraper worker to avoid re-posting
// articles the archive already has (200 when found, 404 otherwise).
app.MapGet("/api/check", async (string id, AppDbContext db) =>
    await db.Articles.AnyAsync(a => a.Id == id)
        ? Results.Ok(new { exists = true })
        : Results.NotFound(new { exists = false }));

app.Run();
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
using NewsArchival.Core.Models;
|
||||
|
||||
namespace NewsArchival.Core.Interfaces;
|
||||
|
||||
/// <summary>
/// Produces the set of newly discovered articles for a news hub (category).
/// </summary>
public interface IScraperService
{
    /// <summary>
    /// Scrapes the given hub and returns articles not seen before.
    /// </summary>
    /// <param name="hub">Hub/category slug, e.g. "politics".</param>
    // The redundant `abstract` modifier was removed: interface members are
    // implicitly abstract in C#.
    Task<HashSet<Article>> GetArticles(string hub);
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
using LexWells.Infrastructure.EntityFramework.Entities;
|
||||
|
||||
namespace NewsArchival.Core.Models;
|
||||
|
||||
/// <summary>
/// A scraped news article persisted by the archival API.
/// </summary>
public class Article : BaseEntity
{
    /// <summary>Headline of the article.</summary>
    public required string Title { get; set; }

    /// <summary>Byline; the scraper falls back to "Staff" when none is found.</summary>
    public required string Author { get; set; }

    /// <summary>Canonical article URL; unique per the AppDbContext index.</summary>
    public required string Url { get; set; }

    /// <summary>Hub/category the article was scraped from (e.g. "politics").</summary>
    public required string Category { get; set; }

    /// <summary>Pictures are archived only for war-zone coverage.</summary>
    public bool SavePictures => IsWarZone;

    /// <summary>Whether the article covers an active war zone.</summary>
    public bool IsWarZone { get; set; }

    /// <summary>Article body text (scraper stores it with ad markup stripped).</summary>
    public required string Content { get; set; }
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net9.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
|
||||
<PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build
WORKDIR /src

# Copy the solution and ALL project files first so restore results are cached.
# NOTE(review): paths renamed NewsArchive.* -> NewsArchival.* to match the
# actual project names in the solution; the NewsArchive.UI copy was dropped
# because no UI project exists in the solution file — confirm.
COPY NewsArchival.sln ./
COPY nuget.config ./
COPY LocalNuGet/ ./LocalNuGet/
COPY NewsArchival.Api/*.csproj ./NewsArchival.Api/
COPY NewsArchival.Scraper/*.csproj ./NewsArchival.Scraper/
COPY NewsArchival.Core/*.csproj ./NewsArchival.Core/

# Restore the whole solution - this fixes the missing HtmlAgilityPack/AngleSharp issue
RUN dotnet restore

# Now copy the actual source code
COPY . .

# Build and Publish the Scraper
WORKDIR "/src/NewsArchival.Scraper"
RUN dotnet publish "NewsArchival.Scraper.csproj" -c Release -o /app/publish --no-restore

FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final
WORKDIR /app
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "NewsArchival.Scraper.dll"]
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
namespace NewsArchival.Scraper.Models;
|
||||
|
||||
/// <summary>
/// Bound configuration for the scraper worker (the "ScraperSettings" section).
/// </summary>
public class ScraperSettings
{
    /// <summary>Hub/category slugs to scrape; a set, so duplicates collapse.</summary>
    public HashSet<string> Hubs { get; set; } = [];

    /// <summary>Minutes to wait between scraping cycles.</summary>
    public int IntervalMinutes { get; set; }
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk.Worker">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net9.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<UserSecretsId>dotnet-NewsArchival.Scraper-9d6e498a-92fd-49c4-863a-c7d529854490</UserSecretsId>
|
||||
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="AngleSharp" Version="1.4.0" />
|
||||
<PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
|
||||
<PackageReference Include="Microsoft.Extensions.Caching.StackExchangeRedis" Version="9.0.14" />
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting" Version="9.0.14"/>
|
||||
<PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
|
||||
<PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" Version="9.0.14" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Content Include="..\.dockerignore">
|
||||
<Link>.dockerignore</Link>
|
||||
</Content>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\NewsArchival.Api\NewsArchival.Api.csproj" />
|
||||
<ProjectReference Include="..\NewsArchival.Core\NewsArchival.Core.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
using AngleSharp;
|
||||
using LexWells.Infrastructure.Common;
|
||||
using LexWells.Infrastructure.EntityFramework;
|
||||
using NewsArchival.Api.Data;
|
||||
using NewsArchival.Core.Interfaces;
|
||||
using NewsArchival.Scraper;
|
||||
using NewsArchival.Scraper.Services;
|
||||
|
||||
var builder = Host.CreateApplicationBuilder(args);

// Shared infrastructure: Redis cache, polite HttpClient, robots.txt service.
builder.Services.AddLexWellsInfrastructure(
    builder.Configuration["ConnectionStrings:Redis"] ?? "localhost:6379");

// EF Core database support.
// NOTE(review): fallback here is "newsarchive.db" while the API's fallback is
// "news.db" — confirm both services are meant to share one database when no
// connection string is configured.
builder.Services.AddLexWellsDatabase<AppDbContext>(
    builder.Configuration["ConnectionStrings:DefaultConnection"] ?? "Data Source=newsarchive.db");

// Named client the Worker uses to POST scraped articles to the archival API.
builder.Services.AddHttpClient("NewsApi", client =>
{
    client.BaseAddress = new Uri(builder.Configuration["ScraperSettings:BaseUrl"] ?? "http://localhost:5000");
});

// One AngleSharp browsing context shared by all scrapes (registered as singleton).
builder.Services.AddSingleton(BrowsingContext.New(Configuration.Default.WithDefaultLoader()));

builder.Services.AddSingleton<IScraperService, LatestService>();

builder.Services.AddHostedService<Worker>();

var host = builder.Build();
await host.RunAsync();
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
using AngleSharp;
|
||||
using LexWells.Infrastructure.Common.Interfaces;
|
||||
using NewsArchival.Core.Models;
|
||||
using NewsArchival.Scraper.Validation;
|
||||
|
||||
namespace NewsArchival.Scraper.Services;
|
||||
|
||||
/// <summary>
/// Scrapes the "latest" listing of an AP News hub and returns fully scraped
/// articles that have not been processed before.
/// </summary>
public class LatestService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : ScraperService(logger, cache, robots, httpClient, browsingContext)
{
    // Sentinel used when a headline cannot be extracted; such articles are discarded.
    private const string Untitled = "Untitled";

    /// <inheritdoc/>
    public override async Task<HashSet<Article>> GetArticles(string hub)
    {
        return await GetLatestFromHub(hub);
    }

    /// <summary>
    /// Returns articles on the hub's listing page that are absent from the
    /// cached snapshot for that hub (compared by URL).
    /// </summary>
    private async Task<HashSet<Article>> GetLatestFromHub(string hub)
    {
        try
        {
            var cacheKey = $"hub:{hub.ToLower()}:latest";

            // Start the cache read and the scrape concurrently.
            var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
            var headlinesTask = GetCurrentHeadlines(hub);

            var cachedData = await cacheTask ?? new HashSet<Article>();
            var currentHeadlines = await headlinesTask;

            // NOTE(review): nothing ever writes to the "hub:*:latest" key, so
            // cachedData is always empty and the Where below filters nothing;
            // de-dup actually happens via "processed_article:*" keys in
            // GetCurrentHeadlines. Confirm whether this snapshot should be
            // populated or the key removed.
            return currentHeadlines
                .Where(x => cachedData.All(c => c.Url != x.Url))
                .ToHashSet();
        }
        catch (Exception e)
        {
            // Was Console.WriteLine(e); route through the injected logger,
            // then rethrow so the caller sees the failure.
            _logger.LogError(e, "Failed to collect latest articles for hub {Hub}", hub);
            throw;
        }
    }

    /// <summary>
    /// Scrapes the hub's listing page (respecting robots.txt) and returns the
    /// full articles that have not been processed within the last day.
    /// </summary>
    private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
    {
        var articles = new HashSet<Article>();
        var hubUri = new Uri($"https://apnews.com/hub/{hub}");

        if (!await _robots.CanCrawlAsync(hubUri))
        {
            _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
            return articles;
        }

        _logger.LogInformation("Scraping starting.");
        var client = _httpClient.CreateClient("LexWellsPoliteClient");

        try
        {
            var html = await client.GetStringAsync(hubUri);
            var document = await _browsingContext.OpenAsync(req => req.Content(html));
            var elements = document.QuerySelectorAll(".PageList-items-item");

            // A story can appear in several tiles on one page; track ids seen
            // within this pass so each is scraped at most once.
            // (Renamed from seenUrls: it holds generated ids, not raw URLs.)
            var seenIds = new HashSet<string>();
            foreach (var element in elements.Take(50))
            {
                var linkElement = element.QuerySelector(".PagePromo-title a");
                if (linkElement == null) continue;

                var articleUrl = linkElement.GetAttribute("href");
                if (string.IsNullOrEmpty(articleUrl)) continue;

                var articleId = GenerateId(articleUrl);

                if (!seenIds.Add(articleId)) continue;

                // Cross-run de-dup: articles processed in the last day are skipped.
                var cacheKey = $"processed_article:{articleId}";
                var isAlreadyProcessed = await _cache.GetAsync<bool>(cacheKey);

                if (isAlreadyProcessed)
                {
                    _logger.LogDebug("Skipping already processed article: {Id}", articleId);
                    continue;
                }

                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null && !string.IsNullOrWhiteSpace(article.Title) && article.Title != Untitled)
                {
                    articles.Add(article);

                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }

                // Politeness: honor the site's crawl delay between article fetches.
                var delay = await _robots.GetCrawlDelayAsync(hubUri);
                await Task.Delay(delay);
            }
        }
        catch (Exception e)
        {
            // Best effort: a failed hub scrape returns whatever was collected.
            _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
        }

        return articles;
    }

    /// <summary>
    /// Downloads one article page and extracts its fields.
    /// Returns null when the story body cannot be located.
    /// </summary>
    private async Task<Article?> ScrapeFullArticle(string url, string hub)
    {
        var document = await _browsingContext.OpenAsync(url);

        var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
        if (storyContent == null) return null;

        return new Article
        {
            Id = GenerateId(url),
            Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
            Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
            Url = url,
            Category = hub,
            Content = Advertisement.Strip(storyContent.InnerHtml),
            IsWarZone = GetWarZoneStatus()
        };
    }

    // Placeholder: war-zone classification is not implemented yet.
    private bool GetWarZoneStatus()
    {
        return false;
    }
}
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using AngleSharp;
|
||||
using LexWells.Infrastructure.Common.Interfaces;
|
||||
using NewsArchival.Core.Interfaces;
|
||||
using NewsArchival.Core.Models;
|
||||
|
||||
namespace NewsArchival.Scraper.Services;
|
||||
|
||||
/// <summary>
/// Base class for hub scrapers: holds the shared injected services and the
/// deterministic article-id generator.
/// </summary>
public abstract class ScraperService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : IScraperService
{
    protected readonly ILogger<Worker> _logger = logger;
    protected readonly ICacheService _cache = cache;
    protected readonly IRobotsService _robots = robots;
    protected readonly IHttpClientFactory _httpClient = httpClient;
    protected readonly IBrowsingContext _browsingContext = browsingContext;

    #region IScraperService

    /// <inheritdoc cref="IScraperService.GetArticles"/>
    public abstract Task<HashSet<Article>> GetArticles(string hub);

    #endregion

    /// <summary>
    /// Derives a stable id for an article from its URL: the lowercase hex
    /// SHA-256 of the normalized absolute URL (scheme + host + path, trailing
    /// slash and query string dropped). Relative URLs are resolved against
    /// https://apnews.com. Returns "unknown" for blank input.
    /// </summary>
    protected static string GenerateId(string url)
    {
        if (string.IsNullOrWhiteSpace(url)) return "unknown";

        var baseUrl = new Uri("https://apnews.com");
        var uri = new Uri(baseUrl, url);

        var cleanPath = uri.AbsolutePath.TrimEnd('/');
        var normalizedUrl = $"{uri.Scheme}://{uri.Host}{cleanPath}".ToLowerInvariant();

        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl));
        // ToLowerInvariant (was ToLower): id generation must never depend on
        // the current culture.
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
using System.Text;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace NewsArchival.Scraper.Validation;
|
||||
|
||||
/// <summary>
/// Strips advertising and layout markup from scraped story HTML, keeping only
/// readable text. Made a static class: the type holds only static members (CA1052).
/// </summary>
public static class Advertisement
{
    /// <summary>
    /// Extracts plain text from <paramref name="htmlContent"/>: keeps the text
    /// of paragraph, anchor, and heading elements, separated by blank lines;
    /// everything else is dropped.
    /// </summary>
    /// <param name="htmlContent">Raw inner HTML of the story body.</param>
    /// <returns>
    /// The cleaned text; the input unchanged when it is null/blank; an empty
    /// string when no matching elements are found.
    /// </returns>
    public static string Strip(string htmlContent)
    {
        try
        {
            if (string.IsNullOrWhiteSpace(htmlContent)) return htmlContent;

            var doc = new HtmlDocument();
            doc.LoadHtml(htmlContent);

            // Keep <p>, <a>, and two-character h* elements (h1..h9).
            var xpath = "//p | //a | //*[starts-with(name(), 'h') and string-length(name()) = 2]";
            var nodesToKeep = doc.DocumentNode.SelectNodes(xpath);
            if (nodesToKeep is null) return string.Empty;

            var sb = new StringBuilder();
            foreach (var node in nodesToKeep)
            {
                // Get the text, trim whitespace, and add a newline for readability
                string text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (!string.IsNullOrWhiteSpace(text))
                {
                    sb.AppendLine(text);
                    sb.AppendLine(); // Adds spacing between paragraphs/headers
                }
            }

            return sb.ToString().Trim();
        }
        catch (Exception e)
        {
            // Surface parse failures on the console, then let the caller handle them.
            Console.WriteLine(e);
            throw;
        }
    }
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
using System.Net.Http.Json;
|
||||
using LexWells.Infrastructure.Common.Interfaces;
|
||||
using NewsArchival.Core.Interfaces;
|
||||
|
||||
namespace NewsArchival.Scraper;
|
||||
|
||||
/// <summary>
/// Background service that repeatedly scrapes the configured hubs and POSTs
/// newly discovered articles to the archival API.
/// </summary>
public class Worker(
    ILogger<Worker> logger,
    IConfiguration configuration,
    IScraperService scraperService,
    IRobotsService robotsService,
    IHttpClientFactory httpClient) : BackgroundService
{
    /// <summary>
    /// Main loop: for each configured hub, collect new articles and archive
    /// them via the API, then sleep out the remainder of the cycle interval.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var apiClient = httpClient.CreateClient("NewsApi");
        var hubs = configuration.GetSection("ScraperSettings:Hubs").Get<List<string>>() ?? [];

        logger.LogInformation("Scraper Worker started at: {time}", DateTimeOffset.Now);

        while (!stoppingToken.IsCancellationRequested)
        {
            var cycleStartTime = DateTime.UtcNow;

            foreach (var hub in hubs)
            {
                try
                {
                    logger.LogInformation("Processing hub: {Hub}", hub);

                    var articles = await scraperService.GetArticles(hub);

                    foreach (var article in articles)
                    {
                        // Skip anything the API already holds.
                        if (await ArticleExistsAsync(apiClient, article.Id, stoppingToken)) continue;

                        var response = await apiClient.PostAsJsonAsync("api/articles", article, stoppingToken);

                        if (response.IsSuccessStatusCode)
                        {
                            logger.LogInformation("Successfully archived: {Title}", article.Title);
                        }
                        else
                        {
                            var error = await response.Content.ReadAsStringAsync(stoppingToken);
                            logger.LogError("API Rejected {Id}: {Status} - {Error}", article.Id, response.StatusCode, error);
                        }
                    }

                    // Politeness delay between hubs, per the site's robots.txt.
                    var uri = new Uri("https://apnews.com");
                    var delayMs = await robotsService.GetCrawlDelayAsync(uri);
                    await Task.Delay(delayMs, stoppingToken);
                }
                catch (Exception ex)
                {
                    // One failing hub must not kill the whole worker loop.
                    logger.LogError(ex, "Critical error processing hub {Hub}", hub);
                }
            }

            await ApplyGlobalDelay(cycleStartTime, stoppingToken);
        }
    }

    /// <summary>Checks the API's /api/check endpoint for an existing article id.</summary>
    private async Task<bool> ArticleExistsAsync(HttpClient client, string id, CancellationToken ct)
    {
        var response = await client.GetAsync($"api/check?id={id}", ct);
        return response.IsSuccessStatusCode;
    }

    /// <summary>
    /// Sleeps out whatever remains of the configured cycle interval after a
    /// scraping pass ("ScraperSettings:IntervalMinutes", default 60).
    /// </summary>
    private async Task ApplyGlobalDelay(DateTime startTime, CancellationToken ct)
    {
        // Was hard-coded to 1 hour even though ScraperSettings:IntervalMinutes
        // exists in configuration and was never read. Defaulting to 60 keeps
        // the previous behavior when the setting is absent.
        var intervalMinutes = configuration.GetValue("ScraperSettings:IntervalMinutes", 60);
        var elapsed = DateTime.UtcNow - startTime;
        var sleepTime = TimeSpan.FromMinutes(intervalMinutes) - elapsed;

        if (sleepTime > TimeSpan.Zero)
        {
            logger.LogInformation("Cycle complete. Sleeping for {Minutes} minutes...", sleepTime.TotalMinutes);
            await Task.Delay(sleepTime, ct);
        }
    }
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Scraper", "NewsArchival.Scraper\NewsArchival.Scraper.csproj", "{D0A4FF28-F8C4-4897-9323-13A343A424FE}"
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C85ACB1A-438D-4CD5-AA25-D8A854B7B6DA}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
compose.yaml = compose.yaml
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Api", "NewsArchival.Api\NewsArchival.Api.csproj", "{F8795C4C-3F3A-41A9-923C-F5013762F36F}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Core", "NewsArchival.Core\NewsArchival.Core.csproj", "{D4FFE4CB-834B-46AC-83DD-CB5035E04403}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
Release|Any CPU = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
EndGlobal
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
services:
|
||||
newsarchival.scraper:
|
||||
image: newsarchival.scraper
|
||||
build:
|
||||
context: .
|
||||
dockerfile: NewsArchival.Scraper/Dockerfile
|
||||
|
||||
Loading…
Reference in New Issue