Initial commit

This commit is contained in:
drew 2026-04-01 23:00:49 -04:00
commit 426cba1dd3
22 changed files with 695 additions and 0 deletions

27
.dockerignore Normal file
View File

@ -0,0 +1,27 @@
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/.idea
**/*.*proj.user
**/*.db
**/*.dbmdl
**/*.jfm
**/azds.yaml
**/bin
**/charts
**/docker-compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
**/*.user
LICENSE
README.md

24
.gitignore vendored Normal file
View File

@ -0,0 +1,24 @@
# IDEs
.idea/
.vs/
.vscode/
# Build outputs
bin/
obj/
dist/
publish/
# Archives and Packages
*.tar.gz
*.zip
*.nupkg
# Databases
*.db
*.db-shm
*.db-wal
# User-specific settings
*.sln.DotSettings.user
appsettings*.json
**/Properties/launchSettings.json

View File

@ -0,0 +1,48 @@
using Microsoft.AspNetCore.Mvc;
using LexWells.Infrastructure.Common.Interfaces;
using Microsoft.EntityFrameworkCore;
using NewsArchival.Api.Data;
using NewsArchival.Core.Models;
namespace NewsArchival.Api.Controllers;
[ApiController]
[Route("api/[controller]")]
public class ArticlesController(AppDbContext db, ICacheService cache) : ControllerBase
{
    /// <summary>
    /// Archives a new article and invalidates the cached listing for its category.
    /// Returns 201 with the stored article, 400 for invalid models, or 409 when
    /// saving conflicts with an existing article.
    /// </summary>
    [HttpPost]
    public async Task<IActionResult> PostArticle([FromBody] Article article)
    {
        // [ApiController] already auto-returns 400 for invalid models; kept for
        // clarity and in case automatic validation is ever suppressed.
        if (!ModelState.IsValid) return BadRequest(ModelState);

        db.Articles.Add(article);
        try
        {
            await db.SaveChangesAsync();
        }
        catch (DbUpdateException)
        {
            // Article.Url has a unique index (see AppDbContext.OnModelCreating);
            // surface the constraint violation as 409 instead of an unhandled 500.
            return Conflict(new { message = "Saving the article conflicts with an existing article." });
        }

        var category = article.Category.ToLower();
        await cache.RemoveAsync($"{category}:all_articles");
        return CreatedAtAction(
            nameof(GetArticles),
            new { hub = category },
            article);
    }

    /// <summary>
    /// Lists articles for a hub/category, newest first, using a 1-hour cache.
    /// </summary>
    [HttpGet("{hub}")]
    public async Task<IActionResult> GetArticles(string hub)
    {
        var cacheKey = $"{hub.ToLower()}:all_articles";
        var articles = await cache.GetAsync<List<Article>>(cacheKey);
        if (articles == null)
        {
            // Compare case-insensitively so the DB filter agrees with the
            // case-normalized cache key (previously "Tech" and "tech" shared a
            // cache entry but produced different query results).
            articles = await db.Articles
                .Where(x => x.Category.ToLower() == hub.ToLower())
                .OrderByDescending(a => a.CreatedAt)
                .ToListAsync();
            await cache.SetAsync(cacheKey, articles, TimeSpan.FromHours(1));
        }
        return Ok(articles);
    }
}

View File

@ -0,0 +1,20 @@
using LexWells.Infrastructure.EntityFramework;
using Microsoft.EntityFrameworkCore;
using NewsArchival.Core.Models;
namespace NewsArchival.Api.Data;
/// <summary>
/// EF Core context for the archival API, exposing the archived articles set.
/// </summary>
public class AppDbContext(DbContextOptions<AppDbContext> options) : LexWellsDbContext(options)
{
    /// <summary>Archived news articles.</summary>
    public DbSet<Article> Articles => Set<Article>();

    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        base.OnModelCreating(modelBuilder);

        // A URL uniquely identifies an article: the unique index stops the
        // same story from being archived twice.
        var article = modelBuilder.Entity<Article>();
        article.HasIndex(a => a.Url).IsUnique();
    }
}

View File

@ -0,0 +1,23 @@
# ---- Build stage ----
FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build
WORKDIR /src

# Copy solution/NuGet config/project files first so restore results are cached
# until a project file changes.
# FIX: the repository's solution and projects are named "NewsArchival.*"
# (see NewsArchival.sln) — the previous "NewsArchive.*" paths cannot exist.
# NOTE(review): a UI project was copied here but is not in the solution;
# dropped from the restore layer — confirm it is not needed.
COPY NewsArchival.sln ./
COPY nuget.config ./
COPY LocalNuGet/ ./LocalNuGet/
COPY NewsArchival.Api/*.csproj ./NewsArchival.Api/
COPY NewsArchival.Scraper/*.csproj ./NewsArchival.Scraper/
COPY NewsArchival.Core/*.csproj ./NewsArchival.Core/
RUN dotnet restore

# Now copy the full source and publish only the API.
COPY . .
WORKDIR "/src/NewsArchival.Api"
RUN dotnet publish "NewsArchival.Api.csproj" -c Release -o /app/publish --no-restore

# ---- Runtime stage ----
FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final
WORKDIR /app
# SQLite data directory, owned by the non-root uid the container runs as.
RUN mkdir -p /app/data && chown -R 1000:1000 /app/data
COPY --from=build /app/publish .
# Run as the uid that owns /app/data (the chown above is pointless under root).
USER 1000
ENTRYPOINT ["dotnet", "NewsArchival.Api.dll"]

View File

@ -0,0 +1,35 @@
<Project Sdk="Microsoft.NET.Sdk.Web">
  <PropertyGroup>
    <TargetFramework>net9.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <InvariantGlobalization>true</InvariantGlobalization>
    <!-- FIX: removed <PublishAot>true</PublishAot>. This project uses MVC
         controllers (AddControllers/MapControllers) and EF Core's SQLite
         provider, neither of which supports Native AOT publishing; the
         Dockerfile's `dotnet publish` would fail or produce a broken app. -->
    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
  </PropertyGroup>
  <ItemGroup>
    <Content Include="..\.dockerignore">
      <Link>.dockerignore</Link>
    </Content>
  </ItemGroup>
  <ItemGroup>
    <PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
    <!-- LexWells packages are pinned to an exact version from LocalNuGet. -->
    <PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
    <PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.14" />
    <PackageReference Include="Microsoft.EntityFrameworkCore" Version="9.0.14" />
    <PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="9.0.14">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
    </PackageReference>
    <PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="9.0.14" />
  </ItemGroup>
  <ItemGroup>
    <ProjectReference Include="..\NewsArchival.Core\NewsArchival.Core.csproj" />
  </ItemGroup>
</Project>

View File

@ -0,0 +1,11 @@
@NewsArchival.Api_HostAddress = http://localhost:5297
GET {{NewsArchival.Api_HostAddress}}/api/articles/politics
Accept: application/json

###

GET {{NewsArchival.Api_HostAddress}}/api/check?id=0000
Accept: application/json

###

View File

@ -0,0 +1,42 @@
using LexWells.Infrastructure.Common;
using LexWells.Infrastructure.EntityFramework;
using Microsoft.EntityFrameworkCore;
using NewsArchival.Api.Data;
// Composition root for the archival API: wires the shared LexWells
// infrastructure, the SQLite-backed DbContext, controllers, and OpenAPI.
var builder = WebApplication.CreateBuilder(args);
// 1. Infrastructure First (Redis + HttpClient + Robots)
builder.Services.AddLexWellsInfrastructure(
    builder.Configuration.GetConnectionString("Redis") ?? "localhost:6379");
// 2. Database Support (falls back to a local SQLite file when no connection
// string is configured)
builder.Services.AddLexWellsDatabase<AppDbContext>(
    builder.Configuration.GetConnectionString("DefaultConnection") ?? "Data Source=news.db");
builder.Services.AddControllers();
builder.Services.AddOpenApi();
var app = builder.Build();
// Apply pending EF Core migrations at startup so the database schema exists
// before the first request. NOTE(review): assumes migrations have been
// generated for this context — with none, Migrate() creates no tables; confirm.
using (var scope = app.Services.CreateScope())
{
    var context = scope.ServiceProvider.GetRequiredService<AppDbContext>();
    context.Database.Migrate();
}
if (app.Environment.IsDevelopment())
{
    app.MapOpenApi();
}
app.UseHttpsRedirection();
app.MapControllers();
// Lightweight existence probe used by the scraper Worker ("api/check?id=...")
// to skip articles that are already archived: 200 when the id exists, 404 otherwise.
app.MapGet("/api/check", async (string id, AppDbContext db) =>
    await db.Articles.AnyAsync(a => a.Id == id)
        ? Results.Ok(new { exists = true })
        : Results.NotFound(new { exists = false }));
app.Run();

View File

@ -0,0 +1,8 @@
using NewsArchival.Core.Models;
namespace NewsArchival.Core.Interfaces;
/// <summary>
/// Scrapes a news hub and returns the set of articles found there.
/// </summary>
public interface IScraperService
{
    /// <summary>
    /// Fetches the current articles for the given hub slug.
    /// </summary>
    // FIX: dropped the redundant `abstract` modifier — interface members are
    // abstract by default and the modifier is unconventional here.
    Task<HashSet<Article>> GetArticles(string hub);
}

View File

@ -0,0 +1,14 @@
using LexWells.Infrastructure.EntityFramework.Entities;
namespace NewsArchival.Core.Models;
/// <summary>
/// A scraped news article persisted to the archive. Id and CreatedAt come
/// from <see cref="BaseEntity"/>.
/// </summary>
public class Article : BaseEntity
{
    /// <summary>Headline of the article.</summary>
    public required string Title { get; set; }

    /// <summary>Byline of the article.</summary>
    public required string Author { get; set; }

    /// <summary>Source URL; unique per archived article (see AppDbContext).</summary>
    public required string Url { get; set; }

    /// <summary>Hub/category slug the article was scraped from.</summary>
    public required string Category { get; set; }

    /// <summary>Plain-text body extracted from the story HTML.</summary>
    public required string Content { get; set; }

    /// <summary>Whether the article is flagged as war-zone coverage.</summary>
    public bool IsWarZone { get; set; }

    /// <summary>Pictures are archived only for war-zone articles.</summary>
    public bool SavePictures => IsWarZone;
}

View File

@ -0,0 +1,14 @@
<!-- Shared domain library: models (Article) and interfaces (IScraperService)
     used by both the Api and Scraper projects. -->
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>
  <ItemGroup>
    <!-- LexWells packages are pinned to an exact version from LocalNuGet;
         EntityFramework supplies BaseEntity for Article. -->
    <PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
    <PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
  </ItemGroup>
</Project>

View File

@ -0,0 +1,26 @@
# ---- Build stage ----
FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build
WORKDIR /src

# Copy the solution and ALL project files first so restore results are cached.
# FIX: the repository's solution and projects are named "NewsArchival.*"
# (see NewsArchival.sln and compose.yaml) — the previous "NewsArchive.*"
# paths cannot exist.
# NOTE(review): a UI project was copied here but is not in the solution;
# dropped from the restore layer — confirm it is not needed.
COPY NewsArchival.sln ./
COPY nuget.config ./
COPY LocalNuGet/ ./LocalNuGet/
COPY NewsArchival.Api/*.csproj ./NewsArchival.Api/
COPY NewsArchival.Scraper/*.csproj ./NewsArchival.Scraper/
COPY NewsArchival.Core/*.csproj ./NewsArchival.Core/
# Restore the whole solution so transitive packages (HtmlAgilityPack/AngleSharp)
# are available to every project.
RUN dotnet restore

# Now copy the actual source code
COPY . .

# Build and publish the scraper worker only.
WORKDIR "/src/NewsArchival.Scraper"
RUN dotnet publish "NewsArchival.Scraper.csproj" -c Release -o /app/publish --no-restore

# ---- Runtime stage ----
FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final
WORKDIR /app
COPY --from=build /app/publish .
ENTRYPOINT ["dotnet", "NewsArchival.Scraper.dll"]

View File

@ -0,0 +1,7 @@
namespace NewsArchival.Scraper.Models;
/// <summary>
/// Scraper configuration (the "ScraperSettings" section): which hub slugs to
/// crawl and how many minutes a full scrape cycle should take.
/// </summary>
public class ScraperSettings
{
    /// <summary>Distinct hub slugs to scrape.</summary>
    public HashSet<string> Hubs { get; set; } = [];

    /// <summary>Minutes between scrape cycles.</summary>
    public int IntervalMinutes { get; set; }
}

View File

@ -0,0 +1,31 @@
<!-- Background worker that scrapes hubs and posts new articles to the API. -->
<Project Sdk="Microsoft.NET.Sdk.Worker">
  <PropertyGroup>
    <TargetFramework>net9.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <UserSecretsId>dotnet-NewsArchival.Scraper-9d6e498a-92fd-49c4-863a-c7d529854490</UserSecretsId>
    <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
  </PropertyGroup>
  <ItemGroup>
    <!-- AngleSharp parses hub/article pages; HtmlAgilityPack strips story HTML
         to plain text (Validation/Advertisement.cs). -->
    <PackageReference Include="AngleSharp" Version="1.4.0" />
    <PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
    <PackageReference Include="Microsoft.Extensions.Caching.StackExchangeRedis" Version="9.0.14" />
    <PackageReference Include="Microsoft.Extensions.Hosting" Version="9.0.14"/>
    <PackageReference Include="LexWells.Infrastructure.EntityFramework" Version="[1.0.0]" />
    <PackageReference Include="LexWells.Infrastructure.Common" Version="[1.0.0]" />
    <PackageReference Include="Microsoft.Extensions.Http" Version="9.0.14" />
  </ItemGroup>
  <ItemGroup>
    <Content Include="..\.dockerignore">
      <Link>.dockerignore</Link>
    </Content>
  </ItemGroup>
  <ItemGroup>
    <!-- NOTE(review): the Api reference exists to reuse AppDbContext
         (NewsArchival.Api.Data) — consider moving the DbContext into Core so
         the worker does not depend on the whole web project. -->
    <ProjectReference Include="..\NewsArchival.Api\NewsArchival.Api.csproj" />
    <ProjectReference Include="..\NewsArchival.Core\NewsArchival.Core.csproj" />
  </ItemGroup>
</Project>

View File

@ -0,0 +1,29 @@
using AngleSharp;
using LexWells.Infrastructure.Common;
using LexWells.Infrastructure.EntityFramework;
using NewsArchival.Api.Data;
using NewsArchival.Core.Interfaces;
using NewsArchival.Scraper;
using NewsArchival.Scraper.Services;
// Composition root for the scraper worker: shared LexWells infrastructure,
// the same AppDbContext as the API, a named HttpClient pointed at the
// archival API, one shared AngleSharp context, and the hosted Worker loop.
var builder = Host.CreateApplicationBuilder(args);
builder.Services.AddLexWellsInfrastructure(
    builder.Configuration["ConnectionStrings:Redis"] ?? "localhost:6379");
builder.Services.AddLexWellsDatabase<AppDbContext>(
    builder.Configuration["ConnectionStrings:DefaultConnection"] ?? "Data Source=newsarchive.db");
// Named client used by Worker to call the API ("api/articles", "api/check").
builder.Services.AddHttpClient("NewsApi", client =>
{
    client.BaseAddress = new Uri(builder.Configuration["ScraperSettings:BaseUrl"] ?? "http://localhost:5000");
});
// Single AngleSharp browsing context with a default loader, shared by all scrapers.
builder.Services.AddSingleton(BrowsingContext.New(Configuration.Default.WithDefaultLoader()));
builder.Services.AddSingleton<IScraperService, LatestService>();
builder.Services.AddHostedService<Worker>();
var host = builder.Build();
await host.RunAsync();

View File

@ -0,0 +1,131 @@
using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Models;
using NewsArchival.Scraper.Validation;
namespace NewsArchival.Scraper.Services;
/// <summary>
/// Scrapes the latest headlines from an AP News hub page and returns the
/// articles not already present in the cached "latest" set for that hub.
/// Respects robots.txt (access check + crawl delay) between article fetches.
/// </summary>
public class LatestService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : ScraperService(logger, cache, robots, httpClient, browsingContext)
{
    private const string Untitled = "Untitled";

    /// <inheritdoc />
    public override async Task<HashSet<Article>> GetArticles(string hub)
    {
        return await GetLatestFromHub(hub);
    }

    // Returns headlines currently on the hub page whose URLs are not in the
    // cached set for this hub.
    private async Task<HashSet<Article>> GetLatestFromHub(string hub)
    {
        try
        {
            var cacheKey = $"hub:{hub.ToLower()}:latest";

            // Run the cache lookup and the live scrape concurrently.
            var cacheTask = _cache.GetAsync<HashSet<Article>>(cacheKey);
            var headlinesTask = GetCurrentHeadlines(hub);

            var cachedData = await cacheTask ?? new HashSet<Article>();
            var currentHeadlines = await headlinesTask;

            // Keep only articles whose URL is absent from the cached set.
            return currentHeadlines
                .Where(x => cachedData.All(c => c.Url != x.Url))
                .ToHashSet();
        }
        catch (Exception e)
        {
            // FIX: was Console.WriteLine(e) — use the injected logger like the
            // rest of this class, and preserve the rethrow.
            _logger.LogError(e, "Failed to get latest articles for hub {Hub}", hub);
            throw;
        }
    }

    // Scrapes the hub listing page, then fetches and parses each article,
    // skipping any already processed within the last day (per-article cache).
    private async Task<HashSet<Article>> GetCurrentHeadlines(string hub)
    {
        var articles = new HashSet<Article>();
        var hubUri = new Uri($"https://apnews.com/hub/{hub}");
        if (!await _robots.CanCrawlAsync(hubUri))
        {
            _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri);
            return articles;
        }
        _logger.LogInformation("Scraping starting.");
        var client = _httpClient.CreateClient("LexWellsPoliteClient");
        try
        {
            var html = await client.GetStringAsync(hubUri);
            var document = await _browsingContext.OpenAsync(req => req.Content(html));
            var elements = document.QuerySelectorAll(".PageList-items-item");
            // Track ids seen on this page so duplicate listings are fetched once.
            var seenIds = new HashSet<string>();
            foreach (var element in elements.Take(50))
            {
                var linkElement = element.QuerySelector(".PagePromo-title a");
                if (linkElement == null) continue;
                var articleUrl = linkElement.GetAttribute("href");
                if (string.IsNullOrEmpty(articleUrl)) continue;

                var articleId = GenerateId(articleUrl);
                if (!seenIds.Add(articleId)) continue;

                // Skip articles processed in a previous run (cached for 1 day).
                var cacheKey = $"processed_article:{articleId}";
                var isAlreadyProcessed = await _cache.GetAsync<bool>(cacheKey);
                if (isAlreadyProcessed)
                {
                    _logger.LogDebug("Skipping already processed article: {Id}", articleId);
                    continue;
                }

                var article = await ScrapeFullArticle(articleUrl, hub);
                if (article != null && !string.IsNullOrWhiteSpace(article.Title) && article.Title != Untitled)
                {
                    articles.Add(article);
                    await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1));
                }

                // Honor the site's crawl delay between article fetches.
                var delay = await _robots.GetCrawlDelayAsync(hubUri);
                await Task.Delay(delay);
            }
        }
        catch (Exception e)
        {
            _logger.LogError(e, "Failed to scrape hub {Hub}", hub);
        }
        return articles;
    }

    // Fetches and parses a single article page; returns null when the story
    // body cannot be located.
    private async Task<Article?> ScrapeFullArticle(string url, string hub)
    {
        var document = await _browsingContext.OpenAsync(url);
        var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody");
        if (storyContent == null) return null;
        return new Article
        {
            Id = GenerateId(url),
            Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? Untitled,
            Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff",
            Url = url,
            Category = hub,
            Content = Advertisement.Strip(storyContent.InnerHtml),
            IsWarZone = GetWarZoneStatus()
        };
    }

    // TODO(review): stub — always false; war-zone detection is not implemented yet.
    private bool GetWarZoneStatus()
    {
        return false;
    }
}

View File

@ -0,0 +1,43 @@
using System.Security.Cryptography;
using System.Text;
using AngleSharp;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Interfaces;
using NewsArchival.Core.Models;
namespace NewsArchival.Scraper.Services;
/// <summary>
/// Base class for scrapers: holds the shared infrastructure services and the
/// URL-to-id hashing used to deduplicate articles.
/// </summary>
public abstract class ScraperService(
    ILogger<Worker> logger,
    ICacheService cache,
    IRobotsService robots,
    IHttpClientFactory httpClient,
    IBrowsingContext browsingContext)
    : IScraperService
{
    protected readonly ILogger<Worker> _logger = logger;
    protected readonly ICacheService _cache = cache;
    protected readonly IRobotsService _robots = robots;
    protected readonly IHttpClientFactory _httpClient = httpClient;
    protected readonly IBrowsingContext _browsingContext = browsingContext;

    #region IScraperService
    /// <inheritdoc cref="IScraperService.GetArticles" />
    public abstract Task<HashSet<Article>> GetArticles(string hub);
    #endregion

    /// <summary>
    /// Derives a stable id from an article URL: resolves relative links against
    /// apnews.com, normalizes scheme/host/path to lowercase (dropping query and
    /// trailing slash), and returns the lowercase hex SHA-256 of the result.
    /// </summary>
    protected static string GenerateId(string url)
    {
        if (string.IsNullOrWhiteSpace(url)) return "unknown";
        // Hub pages use relative hrefs; resolve them against the site root.
        var baseUrl = new Uri("https://apnews.com");
        var uri = new Uri(baseUrl, url);
        var cleanPath = uri.AbsolutePath.TrimEnd('/');
        var normalizedUrl = $"{uri.Scheme}://{uri.Host}{cleanPath}".ToLowerInvariant();
        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl));
        // FIX: use ToLowerInvariant for the hex digest — culture-independent
        // and consistent with the normalization above (CA1311).
        return Convert.ToHexString(bytes).ToLowerInvariant();
    }
}

View File

@ -0,0 +1,41 @@
using System.Text;
using HtmlAgilityPack;
namespace NewsArchival.Scraper.Validation;
/// <summary>
/// Reduces scraped story HTML to readable plain text by keeping only the text
/// of paragraph, anchor, and heading elements.
/// </summary>
public class Advertisement
{
    /// <summary>
    /// Strips markup from <paramref name="htmlContent"/>: returns the input
    /// unchanged when null/blank, an empty string when no matching elements
    /// exist, otherwise the de-entitized text of each kept element separated
    /// by blank lines.
    /// </summary>
    public static string Strip(string htmlContent)
    {
        try
        {
            if (string.IsNullOrWhiteSpace(htmlContent)) return htmlContent;

            var document = new HtmlDocument();
            document.LoadHtml(htmlContent);

            // Keep p, a, and two-character h* tags (h1..h6; note this XPath
            // also matches <hr>, whose empty text is filtered out below).
            var xpath = "//p | //a | //*[starts-with(name(), 'h') and string-length(name()) = 2]";
            var kept = document.DocumentNode.SelectNodes(xpath);
            if (kept is null) return string.Empty;

            var builder = new StringBuilder();
            foreach (var node in kept)
            {
                var text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (string.IsNullOrWhiteSpace(text)) continue;

                builder.AppendLine(text);
                builder.AppendLine(); // blank line between paragraphs/headers
            }
            return builder.ToString().Trim();
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
            throw;
        }
    }
}

View File

@ -0,0 +1,81 @@
using System.Net.Http.Json;
using LexWells.Infrastructure.Common.Interfaces;
using NewsArchival.Core.Interfaces;
namespace NewsArchival.Scraper;
/// <summary>
/// Background loop that scrapes every configured hub each cycle and posts new
/// articles to the archival API, then sleeps out the remainder of the
/// configured interval.
/// </summary>
public class Worker(
    ILogger<Worker> logger,
    IConfiguration configuration,
    IScraperService scraperService,
    IRobotsService robotsService,
    IHttpClientFactory httpClient) : BackgroundService
{
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var apiClient = httpClient.CreateClient("NewsApi");
        var hubs = configuration.GetSection("ScraperSettings:Hubs").Get<List<string>>() ?? [];
        logger.LogInformation("Scraper Worker started at: {time}", DateTimeOffset.Now);
        while (!stoppingToken.IsCancellationRequested)
        {
            var cycleStartTime = DateTime.UtcNow;
            foreach (var hub in hubs)
            {
                try
                {
                    logger.LogInformation("Processing hub: {Hub}", hub);
                    var articles = await scraperService.GetArticles(hub);
                    foreach (var article in articles)
                    {
                        // Skip articles the API already has archived.
                        if (await ArticleExistsAsync(apiClient, article.Id, stoppingToken)) continue;
                        var response = await apiClient.PostAsJsonAsync("api/articles", article, stoppingToken);
                        if (response.IsSuccessStatusCode)
                        {
                            logger.LogInformation("Successfully archived: {Title}", article.Title);
                        }
                        else
                        {
                            var error = await response.Content.ReadAsStringAsync(stoppingToken);
                            logger.LogError("API Rejected {Id}: {Status} - {Error}", article.Id, response.StatusCode, error);
                        }
                    }
                    // Be polite between hubs per the site's robots.txt crawl delay.
                    var uri = new Uri("https://apnews.com");
                    var delay = await robotsService.GetCrawlDelayAsync(uri);
                    await Task.Delay(delay, stoppingToken);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // FIX: graceful shutdown used to be logged as a "Critical
                    // error"; exit the loop quietly instead.
                    return;
                }
                catch (Exception ex)
                {
                    logger.LogError(ex, "Critical error processing hub {Hub}", hub);
                }
            }
            await ApplyGlobalDelay(cycleStartTime, stoppingToken);
        }
    }

    // True when the API's existence probe returns success for the article id.
    private async Task<bool> ArticleExistsAsync(HttpClient client, string id, CancellationToken ct)
    {
        var response = await client.GetAsync($"api/check?id={id}", ct);
        return response.IsSuccessStatusCode;
    }

    // Sleeps out the remainder of the configured cycle interval after a scrape.
    private async Task ApplyGlobalDelay(DateTime startTime, CancellationToken ct)
    {
        // FIX: the cycle length was hard-coded to 1 hour, silently ignoring
        // ScraperSettings.IntervalMinutes. Read it from configuration with the
        // old 60-minute behavior as the default (also used for non-positive values).
        var intervalMinutes = configuration.GetValue<int?>("ScraperSettings:IntervalMinutes") ?? 60;
        if (intervalMinutes <= 0) intervalMinutes = 60;
        var elapsed = DateTime.UtcNow - startTime;
        var sleepTime = TimeSpan.FromMinutes(intervalMinutes) - elapsed;
        if (sleepTime > TimeSpan.Zero)
        {
            logger.LogInformation("Cycle complete. Sleeping for {Minutes} minutes...", sleepTime.TotalMinutes);
            await Task.Delay(sleepTime, ct);
        }
    }
}

33
NewsArchival.sln Normal file
View File

@ -0,0 +1,33 @@

Microsoft Visual Studio Solution File, Format Version 12.00
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Scraper", "NewsArchival.Scraper\NewsArchival.Scraper.csproj", "{D0A4FF28-F8C4-4897-9323-13A343A424FE}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C85ACB1A-438D-4CD5-AA25-D8A854B7B6DA}"
ProjectSection(SolutionItems) = preProject
compose.yaml = compose.yaml
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Api", "NewsArchival.Api\NewsArchival.Api.csproj", "{F8795C4C-3F3A-41A9-923C-F5013762F36F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Core", "NewsArchival.Core\NewsArchival.Core.csproj", "{D4FFE4CB-834B-46AC-83DD-CB5035E04403}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.Build.0 = Release|Any CPU
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.Build.0 = Release|Any CPU
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal

0
README.Md Normal file
View File

7
compose.yaml Normal file
View File

@ -0,0 +1,7 @@
# Compose definition for the scraper worker container.
# NOTE(review): the scraper's configuration references Redis and an archival
# API base URL — confirm whether those services are defined in another compose
# file, since only the scraper is declared here.
services:
  newsarchival.scraper:
    image: newsarchival.scraper
    build:
      context: .
      dockerfile: NewsArchival.Scraper/Dockerfile