From 426cba1dd363aa98797317ab94457ad1b7298c5f Mon Sep 17 00:00:00 2001 From: drew Date: Wed, 1 Apr 2026 23:00:49 -0400 Subject: [PATCH] Initial commit --- .dockerignore | 27 ++++ .gitignore | 24 ++++ .../Controllers/ArticlesController.cs | 48 +++++++ NewsArchival.Api/Data/AppDbContext.cs | 20 +++ NewsArchival.Api/Dockerfile | 23 +++ NewsArchival.Api/NewsArchival.Api.csproj | 35 +++++ NewsArchival.Api/NewsArchival.Api.http | 11 ++ NewsArchival.Api/Program.cs | 42 ++++++ .../Interfaces/IScraperService.cs | 8 ++ NewsArchival.Core/Models/Article.cs | 14 ++ NewsArchival.Core/NewsArchival.Core.csproj | 14 ++ NewsArchival.Scraper/Dockerfile | 26 ++++ .../Models/ScraperSettings.cs | 7 + .../NewsArchival.Scraper.csproj | 31 +++++ NewsArchival.Scraper/Program.cs | 29 ++++ .../Services/LatestService.cs | 131 ++++++++++++++++++ .../Services/ScraperService.cs | 43 ++++++ .../Validation/Advertisement.cs | 41 ++++++ NewsArchival.Scraper/Worker.cs | 81 +++++++++++ NewsArchival.sln | 33 +++++ README.Md | 0 compose.yaml | 7 + 22 files changed, 695 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 NewsArchival.Api/Controllers/ArticlesController.cs create mode 100644 NewsArchival.Api/Data/AppDbContext.cs create mode 100644 NewsArchival.Api/Dockerfile create mode 100644 NewsArchival.Api/NewsArchival.Api.csproj create mode 100644 NewsArchival.Api/NewsArchival.Api.http create mode 100644 NewsArchival.Api/Program.cs create mode 100644 NewsArchival.Core/Interfaces/IScraperService.cs create mode 100644 NewsArchival.Core/Models/Article.cs create mode 100644 NewsArchival.Core/NewsArchival.Core.csproj create mode 100644 NewsArchival.Scraper/Dockerfile create mode 100644 NewsArchival.Scraper/Models/ScraperSettings.cs create mode 100644 NewsArchival.Scraper/NewsArchival.Scraper.csproj create mode 100644 NewsArchival.Scraper/Program.cs create mode 100644 NewsArchival.Scraper/Services/LatestService.cs create mode 100644 
NewsArchival.Scraper/Services/ScraperService.cs create mode 100644 NewsArchival.Scraper/Validation/Advertisement.cs create mode 100644 NewsArchival.Scraper/Worker.cs create mode 100644 NewsArchival.sln create mode 100644 README.Md create mode 100644 compose.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a46963e --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/.idea +**/*.*proj.user +**/*.db +**/*.dbmdl +**/*.jfm +**/azds.yaml +**/bin +**/charts +**/docker-compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +**/*.user +LICENSE +README.md \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ff5f515 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# IDEs +.idea/ +.vs/ +.vscode/ + +# Build outputs +bin/ +obj/ +dist/ +publish/ + +# Archives and Packages +*.tar.gz +*.zip +*.nupkg + +# Databases +*.db +*.db-shm +*.db-wal + +# User-specific settings +*.sln.DotSettings.user +appsettings*.json**/Properties/launchSettings.json diff --git a/NewsArchival.Api/Controllers/ArticlesController.cs b/NewsArchival.Api/Controllers/ArticlesController.cs new file mode 100644 index 0000000..98b8bc1 --- /dev/null +++ b/NewsArchival.Api/Controllers/ArticlesController.cs @@ -0,0 +1,48 @@ +using Microsoft.AspNetCore.Mvc; +using LexWells.Infrastructure.Common.Interfaces; +using Microsoft.EntityFrameworkCore; +using NewsArchival.Api.Data; +using NewsArchival.Core.Models; + +namespace NewsArchival.Api.Controllers; + +[ApiController] +[Route("api/[controller]")] +public class ArticlesController(AppDbContext db, ICacheService cache) : ControllerBase +{ + [HttpPost] + public async Task PostArticle([FromBody] Article article) + { + if (!ModelState.IsValid) return BadRequest(ModelState); + + db.Articles.Add(article); + await 
db.SaveChangesAsync(); + + await cache.RemoveAsync($"{article.Category.ToLower()}:all_articles"); + + return CreatedAtAction( + nameof(GetArticles), + new { hub = article.Category.ToLower() }, + article); + } + + [HttpGet("{hub}")] + public async Task GetArticles(string hub) + { + var cacheKey = $"{hub.ToLower()}:all_articles"; + + var articles = await cache.GetAsync>(cacheKey); + + if (articles == null) + { + articles = await db.Articles + .Where(x => x.Category == hub) + .OrderByDescending(a => a.CreatedAt) + .ToListAsync(); + + await cache.SetAsync(cacheKey, articles, TimeSpan.FromHours(1)); + } + + return Ok(articles); + } +} \ No newline at end of file diff --git a/NewsArchival.Api/Data/AppDbContext.cs b/NewsArchival.Api/Data/AppDbContext.cs new file mode 100644 index 0000000..5142a4e --- /dev/null +++ b/NewsArchival.Api/Data/AppDbContext.cs @@ -0,0 +1,20 @@ +using LexWells.Infrastructure.EntityFramework; +using Microsoft.EntityFrameworkCore; +using NewsArchival.Core.Models; + +namespace NewsArchival.Api.Data; + +public class AppDbContext(DbContextOptions options) : LexWellsDbContext(options) +{ + public DbSet
<Article> Articles => Set<Article>
(); + + protected override void OnModelCreating(ModelBuilder modelBuilder) + { + base.OnModelCreating(modelBuilder); + + // Ensure we don't save the same URL twice + modelBuilder.Entity<Article>
() + .HasIndex(a => a.Url) + .IsUnique(); + } +} \ No newline at end of file diff --git a/NewsArchival.Api/Dockerfile b/NewsArchival.Api/Dockerfile new file mode 100644 index 0000000..84bc068 --- /dev/null +++ b/NewsArchival.Api/Dockerfile @@ -0,0 +1,23 @@ +FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build +WORKDIR /src + +COPY NewsArchive.sln ./ +COPY nuget.config ./ +COPY LocalNuGet/ ./LocalNuGet/ +COPY NewsArchive.Api/*.csproj ./NewsArchive.Api/ +COPY NewsArchive.Scraper/*.csproj ./NewsArchive.Scraper/ +COPY NewsArchive.Core/*.csproj ./NewsArchive.Core/ +COPY NewsArchive.UI/*.csproj ./NewsArchive.UI/ + +RUN dotnet restore + +COPY . . + +WORKDIR "/src/NewsArchive.Api" +RUN dotnet publish "NewsArchive.Api.csproj" -c Release -o /app/publish --no-restore + +FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final +WORKDIR /app +RUN mkdir -p /app/data && chown -R 1000:1000 /app/data +COPY --from=build /app/publish . +ENTRYPOINT ["dotnet", "NewsArchive.Api.dll"] \ No newline at end of file diff --git a/NewsArchival.Api/NewsArchival.Api.csproj b/NewsArchival.Api/NewsArchival.Api.csproj new file mode 100644 index 0000000..2b356e4 --- /dev/null +++ b/NewsArchival.Api/NewsArchival.Api.csproj @@ -0,0 +1,35 @@ + + + + net9.0 + enable + enable + true + true + Linux + + + + + .dockerignore + + + + + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + diff --git a/NewsArchival.Api/NewsArchival.Api.http b/NewsArchival.Api/NewsArchival.Api.http new file mode 100644 index 0000000..97c113f --- /dev/null +++ b/NewsArchival.Api/NewsArchival.Api.http @@ -0,0 +1,11 @@ +@NewsArchival.Api_HostAddress = http://localhost:5297 + +GET {{NewsArchival.Api_HostAddress}}/todos/ +Accept: application/json + +### + +GET {{NewsArchival.Api_HostAddress}}/todos/1 +Accept: application/json + +### diff --git a/NewsArchival.Api/Program.cs b/NewsArchival.Api/Program.cs new file mode 100644 index 0000000..9acb374 --- /dev/null +++ 
b/NewsArchival.Api/Program.cs @@ -0,0 +1,42 @@ +using LexWells.Infrastructure.Common; +using LexWells.Infrastructure.EntityFramework; +using Microsoft.EntityFrameworkCore; +using NewsArchival.Api.Data; + +var builder = WebApplication.CreateBuilder(args); + +// 1. Infrastructure First (Redis + HttpClient + Robots) +builder.Services.AddLexWellsInfrastructure( + builder.Configuration.GetConnectionString("Redis") ?? "localhost:6379"); + +// 2. Database Support +builder.Services.AddLexWellsDatabase( + builder.Configuration.GetConnectionString("DefaultConnection") ?? "Data Source=news.db"); + +builder.Services.AddControllers(); +builder.Services.AddOpenApi(); + +var app = builder.Build(); + +using (var scope = app.Services.CreateScope()) +{ + var context = scope.ServiceProvider.GetRequiredService(); + + context.Database.Migrate(); +} + +if (app.Environment.IsDevelopment()) +{ + app.MapOpenApi(); +} + +app.UseHttpsRedirection(); + +app.MapControllers(); + +app.MapGet("/api/check", async (string id, AppDbContext db) => + await db.Articles.AnyAsync(a => a.Id == id) + ? 
Results.Ok(new { exists = true }) + : Results.NotFound(new { exists = false })); + +app.Run(); \ No newline at end of file diff --git a/NewsArchival.Core/Interfaces/IScraperService.cs b/NewsArchival.Core/Interfaces/IScraperService.cs new file mode 100644 index 0000000..08eaa07 --- /dev/null +++ b/NewsArchival.Core/Interfaces/IScraperService.cs @@ -0,0 +1,8 @@ +using NewsArchival.Core.Models; + +namespace NewsArchival.Core.Interfaces; + +public interface IScraperService +{ + abstract Task> GetArticles(string hub); +} \ No newline at end of file diff --git a/NewsArchival.Core/Models/Article.cs b/NewsArchival.Core/Models/Article.cs new file mode 100644 index 0000000..2492074 --- /dev/null +++ b/NewsArchival.Core/Models/Article.cs @@ -0,0 +1,14 @@ +using LexWells.Infrastructure.EntityFramework.Entities; + +namespace NewsArchival.Core.Models; + +public class Article : BaseEntity +{ + public required string Title { get; set; } + public required string Author { get; set; } + public required string Url { get; set; } + public required string Category { get; set; } + public bool SavePictures => IsWarZone; + public bool IsWarZone { get; set; } + public required string Content { get; set; } +} \ No newline at end of file diff --git a/NewsArchival.Core/NewsArchival.Core.csproj b/NewsArchival.Core/NewsArchival.Core.csproj new file mode 100644 index 0000000..6bd7cb7 --- /dev/null +++ b/NewsArchival.Core/NewsArchival.Core.csproj @@ -0,0 +1,14 @@ + + + + net9.0 + enable + enable + + + + + + + + diff --git a/NewsArchival.Scraper/Dockerfile b/NewsArchival.Scraper/Dockerfile new file mode 100644 index 0000000..a0df6b7 --- /dev/null +++ b/NewsArchival.Scraper/Dockerfile @@ -0,0 +1,26 @@ +FROM mcr.microsoft.com/dotnet/sdk:9.0-alpine AS build +WORKDIR /src + +# Copy the solution and ALL project files first +COPY NewsArchive.sln ./ +COPY nuget.config ./ +COPY LocalNuGet/ ./LocalNuGet/ +COPY NewsArchive.Api/*.csproj ./NewsArchive.Api/ +COPY NewsArchive.Scraper/*.csproj 
./NewsArchive.Scraper/ +COPY NewsArchive.Core/*.csproj ./NewsArchive.Core/ +COPY NewsArchive.UI/*.csproj ./NewsArchive.UI/ + +# Restore the whole solution - this fixes the missing HtmlAgilityPack/AngleSharp issue +RUN dotnet restore + +# Now copy the actual source code +COPY . . + +# Build and Publish the Scraper +WORKDIR "/src/NewsArchive.Scraper" +RUN dotnet publish "NewsArchive.Scraper.csproj" -c Release -o /app/publish --no-restore + +FROM mcr.microsoft.com/dotnet/aspnet:9.0-alpine AS final +WORKDIR /app +COPY --from=build /app/publish . +ENTRYPOINT ["dotnet", "NewsArchive.Scraper.dll"] \ No newline at end of file diff --git a/NewsArchival.Scraper/Models/ScraperSettings.cs b/NewsArchival.Scraper/Models/ScraperSettings.cs new file mode 100644 index 0000000..33ee0dc --- /dev/null +++ b/NewsArchival.Scraper/Models/ScraperSettings.cs @@ -0,0 +1,7 @@ +namespace NewsArchival.Scraper.Models; + +public class ScraperSettings +{ + public HashSet Hubs { get; set; } = new(); + public int IntervalMinutes { get; set; } +} \ No newline at end of file diff --git a/NewsArchival.Scraper/NewsArchival.Scraper.csproj b/NewsArchival.Scraper/NewsArchival.Scraper.csproj new file mode 100644 index 0000000..8a911d8 --- /dev/null +++ b/NewsArchival.Scraper/NewsArchival.Scraper.csproj @@ -0,0 +1,31 @@ + + + + net9.0 + enable + enable + dotnet-NewsArchival.Scraper-9d6e498a-92fd-49c4-863a-c7d529854490 + Linux + + + + + + + + + + + + + + + .dockerignore + + + + + + + + diff --git a/NewsArchival.Scraper/Program.cs b/NewsArchival.Scraper/Program.cs new file mode 100644 index 0000000..724f0a8 --- /dev/null +++ b/NewsArchival.Scraper/Program.cs @@ -0,0 +1,29 @@ +using AngleSharp; +using LexWells.Infrastructure.Common; +using LexWells.Infrastructure.EntityFramework; +using NewsArchival.Api.Data; +using NewsArchival.Core.Interfaces; +using NewsArchival.Scraper; +using NewsArchival.Scraper.Services; + +var builder = Host.CreateApplicationBuilder(args); + +builder.Services.AddLexWellsInfrastructure( 
+ builder.Configuration["ConnectionStrings:Redis"] ?? "localhost:6379"); + +builder.Services.AddLexWellsDatabase( + builder.Configuration["ConnectionStrings:DefaultConnection"] ?? "Data Source=newsarchive.db"); + +builder.Services.AddHttpClient("NewsApi", client => +{ + client.BaseAddress = new Uri(builder.Configuration["ScraperSettings:BaseUrl"] ?? "http://localhost:5000"); +}); + +builder.Services.AddSingleton(BrowsingContext.New(Configuration.Default.WithDefaultLoader())); + +builder.Services.AddSingleton(); + +builder.Services.AddHostedService(); + +var host = builder.Build(); +await host.RunAsync(); \ No newline at end of file diff --git a/NewsArchival.Scraper/Services/LatestService.cs b/NewsArchival.Scraper/Services/LatestService.cs new file mode 100644 index 0000000..18c9b6c --- /dev/null +++ b/NewsArchival.Scraper/Services/LatestService.cs @@ -0,0 +1,131 @@ +using AngleSharp; +using LexWells.Infrastructure.Common.Interfaces; +using NewsArchival.Core.Models; +using NewsArchival.Scraper.Validation; + +namespace NewsArchival.Scraper.Services; + +public class LatestService( + ILogger logger, + ICacheService cache, + IRobotsService robots, + IHttpClientFactory httpClient, + IBrowsingContext browsingContext) + : ScraperService(logger, cache, robots, httpClient, browsingContext) +{ + public override async Task> GetArticles(string hub) + { + return await GetLatestFromHub(hub); + } + + private const string Untitled = "Untitled"; + + private async Task> GetLatestFromHub(string hub) + { + try + { + var cacheKey = $"hub:{hub.ToLower()}:latest"; + + var cacheTask = _cache.GetAsync>(cacheKey); + var headlinesTask = GetCurrentHeadlines(hub); + + var cachedData = await cacheTask ?? new HashSet
<Article>(); + var currentHeadlines = await headlinesTask; + + return currentHeadlines + .Where(x => cachedData.All(c => c.Url != x.Url)) + .ToHashSet(); + } + catch (Exception e) + { + Console.WriteLine(e); + throw; + } + } + + private async Task<HashSet<Article>> GetCurrentHeadlines(string hub) + { + var articles = new HashSet<Article>
(); + var hubUri = new Uri($"https://apnews.com/hub/{hub}"); + + if (!await _robots.CanCrawlAsync(hubUri)) + { + _logger.LogWarning("Robots.txt disallowed access to {Hub}", hubUri); + return articles; + } + + _logger.LogInformation("Scraping starting."); + var client = _httpClient.CreateClient("LexWellsPoliteClient"); + + try + { + var html = await client.GetStringAsync(hubUri); + var document = await _browsingContext.OpenAsync(req => req.Content(html)); + var elements = document.QuerySelectorAll(".PageList-items-item"); + + var seenUrls = new HashSet(); + foreach (var element in elements.Take(50)) + { + var linkElement = element.QuerySelector(".PagePromo-title a"); + if (linkElement == null) continue; + + var articleUrl = linkElement.GetAttribute("href"); + if (string.IsNullOrEmpty(articleUrl)) continue; + + var articleId = GenerateId(articleUrl); + + if (!seenUrls.Add(articleId)) continue; + + var cacheKey = $"processed_article:{articleId}"; + var isAlreadyProcessed = await _cache.GetAsync(cacheKey); + + if (isAlreadyProcessed) + { + _logger.LogDebug("Skipping already processed article: {Id}", articleId); + continue; + } + + var article = await ScrapeFullArticle(articleUrl, hub); + if (article != null && !string.IsNullOrWhiteSpace(article.Title) && article.Title != Untitled) + { + articles.Add(article); + + await _cache.SetAsync(cacheKey, true, TimeSpan.FromDays(1)); + } + + var delay = await _robots.GetCrawlDelayAsync(hubUri); + await Task.Delay(delay); + } + } + catch (Exception e) + { + _logger.LogError(e, "Failed to scrape hub {Hub}", hub); + } + + return articles; + } + + private async Task ScrapeFullArticle(string url, string hub) + { + var document = await _browsingContext.OpenAsync(url); + + var storyContent = document.QuerySelector("div.RichTextStoryBody.RichTextBody"); + if (storyContent == null) return null; + + return new Article + { + Id = GenerateId(url), + Title = document.QuerySelector("h1.Page-headline")?.TextContent.Trim() ?? 
Untitled, + Author = document.QuerySelector(".Page-authors a.Link")?.TextContent.Trim() ?? "Staff", + Url = url, + Category = hub, + Content = Advertisement.Strip(storyContent.InnerHtml), + IsWarZone = GetWarZoneStatus() + }; + } + + private bool GetWarZoneStatus() + { + return false; + } +} \ No newline at end of file diff --git a/NewsArchival.Scraper/Services/ScraperService.cs b/NewsArchival.Scraper/Services/ScraperService.cs new file mode 100644 index 0000000..906df25 --- /dev/null +++ b/NewsArchival.Scraper/Services/ScraperService.cs @@ -0,0 +1,43 @@ +using System.Security.Cryptography; +using System.Text; +using AngleSharp; +using LexWells.Infrastructure.Common.Interfaces; +using NewsArchival.Core.Interfaces; +using NewsArchival.Core.Models; + +namespace NewsArchival.Scraper.Services; + +public abstract class ScraperService( + ILogger logger, + ICacheService cache, + IRobotsService robots, + IHttpClientFactory httpClient, + IBrowsingContext browsingContext) + : IScraperService +{ + protected readonly ILogger _logger = logger; + protected readonly ICacheService _cache = cache; + protected readonly IRobotsService _robots = robots; + protected readonly IHttpClientFactory _httpClient = httpClient; + protected readonly IBrowsingContext _browsingContext = browsingContext; + + #region IScraperService + + public abstract Task> GetArticles(string hub); + + #endregion + + protected static string GenerateId(string url) + { + if (string.IsNullOrWhiteSpace(url)) return "unknown"; + + var baseUrl = new Uri("https://apnews.com"); + var uri = new Uri(baseUrl, url); + + var cleanPath = uri.AbsolutePath.TrimEnd('/'); + var normalizedUrl = $"{uri.Scheme}://{uri.Host}{cleanPath}".ToLowerInvariant(); + + var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(normalizedUrl)); + return Convert.ToHexString(bytes).ToLower(); + } +} \ No newline at end of file diff --git a/NewsArchival.Scraper/Validation/Advertisement.cs b/NewsArchival.Scraper/Validation/Advertisement.cs new file mode 
100644 index 0000000..565405a --- /dev/null +++ b/NewsArchival.Scraper/Validation/Advertisement.cs @@ -0,0 +1,41 @@ +using System.Text; +using HtmlAgilityPack; + +namespace NewsArchival.Scraper.Validation; + +public class Advertisement +{ + public static string Strip(string htmlContent) + { + try + { + if (string.IsNullOrWhiteSpace(htmlContent)) return htmlContent; + + var doc = new HtmlDocument(); + doc.LoadHtml(htmlContent); + + var xpath = "//p | //a | //*[starts-with(name(), 'h') and string-length(name()) = 2]"; + var nodesToKeep = doc.DocumentNode.SelectNodes(xpath); + if (nodesToKeep is null) return string.Empty; + + var sb = new StringBuilder(); + foreach (var node in nodesToKeep) + { + // Get the text, trim whitespace, and add a newline for readability + string text = HtmlEntity.DeEntitize(node.InnerText).Trim(); + if (!string.IsNullOrWhiteSpace(text)) + { + sb.AppendLine(text); + sb.AppendLine(); // Adds spacing between paragraphs/headers + } + } + + return sb.ToString().Trim(); + } + catch (Exception e) + { + Console.WriteLine(e); + throw; + } + } +} \ No newline at end of file diff --git a/NewsArchival.Scraper/Worker.cs b/NewsArchival.Scraper/Worker.cs new file mode 100644 index 0000000..f073fe1 --- /dev/null +++ b/NewsArchival.Scraper/Worker.cs @@ -0,0 +1,81 @@ +using System.Net.Http.Json; +using LexWells.Infrastructure.Common.Interfaces; +using NewsArchival.Core.Interfaces; + +namespace NewsArchival.Scraper; + +public class Worker( + ILogger logger, + IConfiguration configuration, + IScraperService scraperService, + IRobotsService robotsService, + IHttpClientFactory httpClient) : BackgroundService +{ + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + var apiClient = httpClient.CreateClient("NewsApi"); + var hubs = configuration.GetSection("ScraperSettings:Hubs").Get>() ?? 
[]; + + logger.LogInformation("Scraper Worker started at: {time}", DateTimeOffset.Now); + + while (!stoppingToken.IsCancellationRequested) + { + var cycleStartTime = DateTime.UtcNow; + + foreach (var hub in hubs) + { + try + { + logger.LogInformation("Processing hub: {Hub}", hub); + + var articles = await scraperService.GetArticles(hub); + + foreach (var article in articles) + { + if (await ArticleExistsAsync(apiClient, article.Id, stoppingToken)) continue; + + var response = await apiClient.PostAsJsonAsync("api/articles", article, stoppingToken); + + if (response.IsSuccessStatusCode) + { + logger.LogInformation("Successfully archived: {Title}", article.Title); + } + else + { + var error = await response.Content.ReadAsStringAsync(stoppingToken); + logger.LogError("API Rejected {Id}: {Status} - {Error}", article.Id, response.StatusCode, error); + } + } + + var uri = new Uri("https://apnews.com"); + var delayMs = await robotsService.GetCrawlDelayAsync(uri); + await Task.Delay(delayMs, stoppingToken); + } + catch (Exception ex) + { + logger.LogError(ex, "Critical error processing hub {Hub}", hub); + } + } + + await ApplyGlobalDelay(cycleStartTime, stoppingToken); + } + } + + private async Task ArticleExistsAsync(HttpClient client, string id, CancellationToken ct) + { + var response = await client.GetAsync($"api/check?id={id}", ct); + return response.IsSuccessStatusCode; + } + + private async Task ApplyGlobalDelay(DateTime startTime, CancellationToken ct) + { + var elapsed = DateTime.UtcNow - startTime; + var sleepTime = TimeSpan.FromHours(1) - elapsed; + + if (sleepTime > TimeSpan.Zero) + { + logger.LogInformation("Cycle complete. 
Sleeping for {Minutes} minutes...", sleepTime.TotalMinutes); + await Task.Delay(sleepTime, ct); + } + } +} \ No newline at end of file diff --git a/NewsArchival.sln b/NewsArchival.sln new file mode 100644 index 0000000..97e7461 --- /dev/null +++ b/NewsArchival.sln @@ -0,0 +1,33 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Scraper", "NewsArchival.Scraper\NewsArchival.Scraper.csproj", "{D0A4FF28-F8C4-4897-9323-13A343A424FE}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C85ACB1A-438D-4CD5-AA25-D8A854B7B6DA}" + ProjectSection(SolutionItems) = preProject + compose.yaml = compose.yaml + EndProjectSection +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Api", "NewsArchival.Api\NewsArchival.Api.csproj", "{F8795C4C-3F3A-41A9-923C-F5013762F36F}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NewsArchival.Core", "NewsArchival.Core\NewsArchival.Core.csproj", "{D4FFE4CB-834B-46AC-83DD-CB5035E04403}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D0A4FF28-F8C4-4897-9323-13A343A424FE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D0A4FF28-F8C4-4897-9323-13A343A424FE}.Release|Any CPU.Build.0 = Release|Any CPU + {F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F8795C4C-3F3A-41A9-923C-F5013762F36F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F8795C4C-3F3A-41A9-923C-F5013762F36F}.Release|Any CPU.Build.0 = Release|Any CPU + 
{D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D4FFE4CB-834B-46AC-83DD-CB5035E04403}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/README.Md b/README.Md new file mode 100644 index 0000000..e69de29 diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..103b152 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,7 @@ +services: + newsarchival.scraper: + image: newsarchival.scraper + build: + context: . + dockerfile: NewsArchival.Scraper/Dockerfile +