using System.Globalization;
using System.Net;

using Microsoft.Extensions.Logging;

using LexWells.Infrastructure.Common.Interfaces;
using LexWells.Infrastructure.Common.Models;

namespace LexWells.Infrastructure.Common.Services;

/// <summary>
/// Fetches and caches per-host robots.txt policies and answers whether a URI
/// may be crawled and how long to wait between requests to that host.
/// </summary>
public class RobotsService(
    IHttpClientFactory clientFactory,
    ICacheService cache,
    ILogger logger) : IRobotsService
{
    private const string CachePrefix = "robots_";
    private const string DefaultUserAgent = "NewsArchiveBot";

    /// <summary>
    /// Returns whether robots.txt allows <paramref name="userAgent"/> to crawl <paramref name="uri"/>.
    /// </summary>
    public async Task<bool> CanCrawlAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent);
        return entry.IsAllowed;
    }

    /// <summary>
    /// Returns the Crawl-delay declared for the host, or a 1-second default when none is set.
    /// </summary>
    public async Task<TimeSpan> GetCrawlDelayAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent);
        return entry.CrawlDelay.HasValue
            ? TimeSpan.FromSeconds(entry.CrawlDelay.Value)
            : TimeSpan.FromSeconds(1);
    }

    /// <summary>
    /// Resolves the robots entry for the URI's host, serving from cache while the
    /// cached entry is unexpired. Cache failures are logged and treated as misses
    /// so a broken cache never blocks crawling decisions.
    /// </summary>
    private async Task<RobotsEntry> GetRobotsAsync(Uri uri, string userAgent)
    {
        // NOTE(review): the cache key is per-host only, so sites on the same host
        // but different ports share one robots entry — confirm this is acceptable.
        string host = uri.Host;
        string cacheKey = $"{CachePrefix}{host}";

        try
        {
            var cached = await cache.GetAsync<RobotsEntry>(cacheKey);
            if (cached != null && cached.ExpiresAt > DateTime.UtcNow)
            {
                return cached;
            }
        }
        catch (Exception ex)
        {
            // Best-effort cache: fall through to a fresh fetch on any cache error.
            logger.LogWarning(ex, "Failed to retrieve or deserialize robots cache for {Host}", host);
        }

        var entry = await FetchAndParseRobotsAsync(uri, userAgent);

        // Disallowed/error results are cached briefly so transient failures retry soon;
        // allowed results are kept for a full day.
        var cacheDuration = entry.IsAllowed ? TimeSpan.FromDays(1) : TimeSpan.FromMinutes(10);
        try
        {
            await cache.SetAsync(cacheKey, entry, cacheDuration);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Failed to save robots entry to cache for {Host}", host);
        }

        return entry;
    }

    /// <summary>
    /// Downloads and parses robots.txt for the URI's origin.
    /// A 404 means the site publishes no robots policy, so crawling is allowed;
    /// any other failure is treated conservatively as disallowed for a short period.
    /// </summary>
    private async Task<RobotsEntry> FetchAndParseRobotsAsync(Uri uri, string userAgent)
    {
        var client = clientFactory.CreateClient();

        // Uri.Authority includes a non-default port, so robots.txt is fetched from
        // the same origin being crawled (uri.Host alone would drop the port).
        var robotsUrl = $"{uri.Scheme}://{uri.Authority}/robots.txt";

        // The HTTP User-Agent header is fixed; the userAgent parameter only selects
        // which robots.txt group applies during parsing.
        client.DefaultRequestHeaders.UserAgent.ParseAdd(
            "Mozilla/5.0 (compatible; NewsArchiveBot/1.0; +http://lexwells.com/bot)");

        try
        {
            var response = await client.GetStringAsync(robotsUrl);
            return ParseRobotsContent(response, userAgent, uri.AbsolutePath);
        }
        catch (HttpRequestException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
        {
            // No robots.txt: everything is allowed.
            return new RobotsEntry(true, null, null, DateTime.UtcNow.AddDays(1));
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Error fetching robots.txt from {Url}", robotsUrl);
            return new RobotsEntry(false, null, null, DateTime.UtcNow.AddMinutes(10));
        }
    }

    /// <summary>
    /// Parses robots.txt content and evaluates whether <paramref name="path"/> is
    /// allowed for <paramref name="userAgent"/>. A group addressed to the bot by
    /// name takes precedence over the wildcard (*) group. Rules use simple prefix
    /// matching; wildcard patterns inside rule values are not supported, and later
    /// matching rules override earlier ones (no longest-match precedence).
    /// </summary>
    private RobotsEntry ParseRobotsContent(string content, string userAgent, string path)
    {
        bool isAllowed = true;
        int? crawlDelay = null;
        bool foundSpecificMatch = false;

        // Splitting on "\n" leaves a trailing "\r" from CRLF files; Trim() below removes it.
        var lines = content.Split(new[] { "\n", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
        bool inRelevantSection = false;

        foreach (var line in lines)
        {
            var cleanLine = line.Trim();
            if (string.IsNullOrWhiteSpace(cleanLine) || cleanLine.StartsWith("#"))
                continue;

            var parts = cleanLine.Split(':', 2);
            if (parts.Length < 2)
                continue;

            // Ordinal comparisons avoid culture-sensitive casing bugs (e.g. Turkish 'I').
            var key = parts[0].Trim();
            var value = parts[1].Trim();

            if (key.Equals("user-agent", StringComparison.OrdinalIgnoreCase))
            {
                if (value.Equals(userAgent, StringComparison.OrdinalIgnoreCase))
                {
                    // A group naming us directly resets and overrides anything the * group said.
                    inRelevantSection = true;
                    foundSpecificMatch = true;
                    isAllowed = true;
                    crawlDelay = null;
                }
                else if (value == "*" && !foundSpecificMatch)
                {
                    inRelevantSection = true;
                }
                else
                {
                    inRelevantSection = false;
                }
                continue;
            }

            if (!inRelevantSection)
                continue;

            if (key.Equals("disallow", StringComparison.OrdinalIgnoreCase))
            {
                if (string.IsNullOrEmpty(value))
                    isAllowed = true; // "Disallow:" with an empty value allows everything
                else if (path.StartsWith(value, StringComparison.OrdinalIgnoreCase))
                    isAllowed = false;
            }
            else if (key.Equals("allow", StringComparison.OrdinalIgnoreCase))
            {
                if (path.StartsWith(value, StringComparison.OrdinalIgnoreCase))
                    isAllowed = true;
            }
            else if (key.Equals("crawl-delay", StringComparison.OrdinalIgnoreCase))
            {
                // Invariant culture: robots.txt is machine-readable, not locale-formatted.
                if (int.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out int delay))
                    crawlDelay = delay;
            }
        }

        return new RobotsEntry(isAllowed, crawlDelay, null, DateTime.UtcNow.AddDays(1));
    }
}