// LexWells.Infrastructure/LexWells.Infrastructure.Common/Services/RobotsService.cs
using System.Net;
using Microsoft.Extensions.Logging;
using LexWells.Infrastructure.Common.Interfaces;
using LexWells.Infrastructure.Common.Models;
namespace LexWells.Infrastructure.Common.Services;
/// <summary>
/// Resolves robots.txt permissions and crawl delays for URLs, caching the
/// per-URL verdict in <see cref="ICacheService"/> to avoid refetching
/// robots.txt on every request.
/// </summary>
public class RobotsService(
IHttpClientFactory clientFactory,
ICacheService cache,
ILogger<RobotsService> logger) : IRobotsService
{
    private const string CachePrefix = "robots_";
    private const string DefaultUserAgent = "NewsArchiveBot";

    // Full User-Agent header sent when fetching robots.txt (product token plus
    // contact URL, per crawler convention). Distinct from the short token used
    // to match User-agent groups inside robots.txt.
    private const string FetchUserAgent =
        "Mozilla/5.0 (compatible; NewsArchiveBot/1.0; +http://lexwells.com/bot)";

    /// <summary>Returns true when robots.txt permits crawling <paramref name="uri"/>.</summary>
    /// <param name="uri">Absolute URL to check.</param>
    /// <param name="userAgent">Agent token matched against User-agent groups.</param>
    public async Task<bool> CanCrawlAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent).ConfigureAwait(false);
        return entry.IsAllowed;
    }

    /// <summary>
    /// Returns the Crawl-delay declared for the agent, or a 1-second default
    /// when robots.txt declares none.
    /// </summary>
    public async Task<TimeSpan> GetCrawlDelayAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent).ConfigureAwait(false);
        return entry.CrawlDelay.HasValue
            ? TimeSpan.FromSeconds(entry.CrawlDelay.Value)
            : TimeSpan.FromSeconds(1);
    }

    /// <summary>
    /// Returns the cached robots verdict for <paramref name="uri"/>, fetching
    /// and parsing robots.txt on a cache miss or expiry. Cache failures are
    /// logged and never block crawling.
    /// </summary>
    private async Task<RobotsEntry> GetRobotsAsync(Uri uri, string userAgent)
    {
        // The cached entry's IsAllowed verdict is path-specific, so the key must
        // include the path: keying on host alone would reuse the first URL's
        // verdict for every path on that host. Authority (host[:port]) keeps
        // sites on different ports apart.
        string cacheKey = $"{CachePrefix}{uri.Authority}{uri.AbsolutePath}";
        try
        {
            var cached = await cache.GetAsync<RobotsEntry>(cacheKey).ConfigureAwait(false);
            if (cached != null && cached.ExpiresAt > DateTime.UtcNow)
            {
                return cached;
            }
        }
        catch (Exception ex)
        {
            // A broken cache must not block crawling; fall through to a fresh fetch.
            logger.LogWarning(ex, "Failed to retrieve or deserialize robots cache for {Host}", uri.Host);
        }

        var entry = await FetchAndParseRobotsAsync(uri, userAgent).ConfigureAwait(false);

        // Disallowed/failed verdicts are cached briefly so transient errors
        // (and our fail-closed fallback) are retried soon.
        var cacheDuration = entry.IsAllowed ? TimeSpan.FromDays(1) : TimeSpan.FromMinutes(10);
        try
        {
            await cache.SetAsync(cacheKey, entry, cacheDuration).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Failed to save robots entry to cache for {Host}", uri.Host);
        }
        return entry;
    }

    /// <summary>
    /// Downloads robots.txt for the URL's authority and parses it for
    /// <paramref name="userAgent"/>. A missing file (404) allows everything;
    /// any other failure fails closed with a short expiry.
    /// </summary>
    private async Task<RobotsEntry> FetchAndParseRobotsAsync(Uri uri, string userAgent)
    {
        // GetLeftPart(Authority) preserves a non-default port, which
        // "{Scheme}://{Host}" would silently drop.
        var robotsUrl = $"{uri.GetLeftPart(UriPartial.Authority)}/robots.txt";
        var client = clientFactory.CreateClient();
        try
        {
            // Per-request header instead of mutating the client's defaults.
            using var request = new HttpRequestMessage(HttpMethod.Get, robotsUrl);
            request.Headers.UserAgent.ParseAdd(FetchUserAgent);
            using var response = await client.SendAsync(request).ConfigureAwait(false);
            if (response.StatusCode == HttpStatusCode.NotFound)
            {
                // No robots.txt means everything is crawlable.
                return new RobotsEntry(true, null, null, DateTime.UtcNow.AddDays(1));
            }
            response.EnsureSuccessStatusCode();
            var content = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            return ParseRobotsContent(content, userAgent, uri.AbsolutePath);
        }
        catch (Exception ex)
        {
            // Fail closed on any other error; short expiry so we retry soon.
            logger.LogError(ex, "Error fetching robots.txt from {Url}", robotsUrl);
            return new RobotsEntry(false, null, null, DateTime.UtcNow.AddMinutes(10));
        }
    }

    /// <summary>
    /// Parses robots.txt content and decides whether <paramref name="path"/> is
    /// crawlable for <paramref name="userAgent"/>. Rule precedence follows
    /// RFC 9309 longest-match: the most specific matching rule wins and Allow
    /// wins ties. Wildcard patterns ('*', '$') in rule paths are not supported;
    /// rules are matched as literal path prefixes.
    /// </summary>
    private RobotsEntry ParseRobotsContent(string content, string userAgent, string path)
    {
        int longestAllow = -1;     // length of the most specific matching Allow rule
        int longestDisallow = -1;  // length of the most specific matching Disallow rule
        int? crawlDelay = null;
        bool foundSpecificMatch = false;
        bool inRelevantSection = false;
        bool readingAgentRun = false;  // inside a run of consecutive User-agent lines

        foreach (var rawLine in content.Split('\n'))
        {
            var line = rawLine.Trim();
            if (line.Length == 0 || line.StartsWith('#')) continue;

            var parts = line.Split(':', 2);
            if (parts.Length < 2) continue;
            var key = parts[0].Trim();
            var value = parts[1].Trim();

            if (key.Equals("user-agent", StringComparison.OrdinalIgnoreCase))
            {
                // Consecutive User-agent lines form one group whose rules apply
                // to every listed agent, so relevance is OR-ed across the run.
                if (!readingAgentRun)
                {
                    readingAgentRun = true;
                    inRelevantSection = false;
                }
                if (value.Equals(userAgent, StringComparison.OrdinalIgnoreCase))
                {
                    inRelevantSection = true;
                    if (!foundSpecificMatch)
                    {
                        // Our agent has its own group: discard anything
                        // accumulated from wildcard ('*') groups.
                        foundSpecificMatch = true;
                        longestAllow = -1;
                        longestDisallow = -1;
                        crawlDelay = null;
                    }
                }
                else if (value == "*" && !foundSpecificMatch)
                {
                    inRelevantSection = true;
                }
                continue;
            }

            readingAgentRun = false;
            if (!inRelevantSection) continue;

            if (key.Equals("disallow", StringComparison.OrdinalIgnoreCase))
            {
                // An empty Disallow imposes no restriction.
                if (value.Length > 0 &&
                    path.StartsWith(value, StringComparison.OrdinalIgnoreCase) &&
                    value.Length > longestDisallow)
                {
                    longestDisallow = value.Length;
                }
            }
            else if (key.Equals("allow", StringComparison.OrdinalIgnoreCase))
            {
                if (value.Length > 0 &&
                    path.StartsWith(value, StringComparison.OrdinalIgnoreCase) &&
                    value.Length > longestAllow)
                {
                    longestAllow = value.Length;
                }
            }
            else if (key.Equals("crawl-delay", StringComparison.OrdinalIgnoreCase) &&
                     int.TryParse(value, out int delay))
            {
                crawlDelay = delay;
            }
        }

        // No matching rule (both -1) means allowed; Allow wins ties (RFC 9309).
        bool isAllowed = longestAllow >= longestDisallow;
        return new RobotsEntry(isAllowed, crawlDelay, null, DateTime.UtcNow.AddDays(1));
    }
}