151 lines
5.0 KiB
C#
using System.Globalization;
using System.Net;

using Microsoft.Extensions.Logging;

using LexWells.Infrastructure.Common.Interfaces;
using LexWells.Infrastructure.Common.Models;
namespace LexWells.Infrastructure.Common.Services;
/// <summary>
/// Fetches, caches and evaluates robots.txt rules so the crawler can honor a
/// site's allow/disallow directives and requested crawl delay.
/// </summary>
public class RobotsService(
    IHttpClientFactory clientFactory,
    ICacheService cache,
    ILogger<RobotsService> logger) : IRobotsService
{
    private const string CachePrefix = "robots_";
    private const string DefaultUserAgent = "NewsArchiveBot";

    // Fallback politeness delay used when robots.txt declares no Crawl-delay.
    private static readonly TimeSpan DefaultCrawlDelay = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Returns whether <paramref name="uri"/> may be crawled according to the
    /// site's robots.txt rules for <paramref name="userAgent"/>.
    /// </summary>
    public async Task<bool> CanCrawlAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent);
        return entry.IsAllowed;
    }

    /// <summary>
    /// Returns the Crawl-delay declared for <paramref name="userAgent"/>, or a
    /// one-second default when none is declared. Never returns a negative span.
    /// </summary>
    public async Task<TimeSpan> GetCrawlDelayAsync(Uri uri, string userAgent = DefaultUserAgent)
    {
        var entry = await GetRobotsAsync(uri, userAgent);

        // Math.Max guards against a malformed negative Crawl-delay producing a
        // negative TimeSpan (which Task.Delay would reject).
        return entry.CrawlDelay is int seconds
            ? TimeSpan.FromSeconds(Math.Max(0, seconds))
            : DefaultCrawlDelay;
    }

    /// <summary>
    /// Resolves the robots entry for the URI's site, preferring a fresh cached
    /// copy and otherwise fetching and re-caching it. Cache failures are
    /// logged but never block crawling.
    /// </summary>
    private async Task<RobotsEntry> GetRobotsAsync(Uri uri, string userAgent)
    {
        // Key on the full authority (host[:port]) so a site on a non-default
        // port does not share an entry with the default-port site on the same
        // host. For default ports Uri.Authority equals Uri.Host, so existing
        // keys are unchanged.
        string authority = uri.Authority;
        string cacheKey = $"{CachePrefix}{authority}";

        try
        {
            var cached = await cache.GetAsync<RobotsEntry>(cacheKey);

            if (cached != null && cached.ExpiresAt > DateTime.UtcNow)
            {
                return cached;
            }
        }
        catch (Exception ex)
        {
            // A broken cache must not stop crawling; fall through to a live fetch.
            logger.LogWarning(ex, "Failed to retrieve or deserialize robots cache for {Host}", authority);
        }

        var entry = await FetchAndParseRobotsAsync(uri, userAgent);

        // Cache denials only briefly so a transient fetch failure does not
        // block a host for a whole day.
        var cacheDuration = entry.IsAllowed ? TimeSpan.FromDays(1) : TimeSpan.FromMinutes(10);

        try
        {
            await cache.SetAsync(cacheKey, entry, cacheDuration);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Failed to save robots entry to cache for {Host}", authority);
        }

        return entry;
    }

    /// <summary>
    /// Downloads and parses robots.txt for the URI's site. A 404 means the
    /// site publishes no rules (everything allowed); any other failure is
    /// treated conservatively as disallowed for a short period.
    /// </summary>
    private async Task<RobotsEntry> FetchAndParseRobotsAsync(Uri uri, string userAgent)
    {
        var client = clientFactory.CreateClient();

        // GetLeftPart(UriPartial.Authority) preserves a non-default port,
        // unlike the previous "{Scheme}://{Host}" which silently dropped it.
        var robotsUrl = $"{uri.GetLeftPart(UriPartial.Authority)}/robots.txt";

        // Set the UA on the request instead of mutating the client's default
        // headers, so repeated calls cannot accumulate duplicate UA products.
        using var request = new HttpRequestMessage(HttpMethod.Get, robotsUrl);
        request.Headers.UserAgent.ParseAdd(
            "Mozilla/5.0 (compatible; NewsArchiveBot/1.0; +http://lexwells.com/bot)");

        try
        {
            using var response = await client.SendAsync(request);

            if (response.StatusCode == HttpStatusCode.NotFound)
            {
                // No robots.txt published: the whole site is crawlable.
                return new RobotsEntry(true, null, null, DateTime.UtcNow.AddDays(1));
            }

            response.EnsureSuccessStatusCode();

            var content = await response.Content.ReadAsStringAsync();
            return ParseRobotsContent(content, userAgent, uri.AbsolutePath);
        }
        catch (Exception ex)
        {
            // Fail closed: an unreachable robots.txt disallows crawling until
            // the short-lived negative cache entry expires and we retry.
            logger.LogError(ex, "Error fetching robots.txt from {Url}", robotsUrl);
            return new RobotsEntry(false, null, null, DateTime.UtcNow.AddMinutes(10));
        }
    }

    /// <summary>
    /// Minimal robots.txt evaluator for a single path. Applies the group that
    /// names <paramref name="userAgent"/> specifically, falling back to the
    /// wildcard ("*") group when no specific group exists (RFC 9309).
    /// </summary>
    private RobotsEntry ParseRobotsContent(string content, string userAgent, string path)
    {
        bool isAllowed = true;
        int? crawlDelay = null;
        bool foundSpecificMatch = false;
        bool inRelevantSection = false;

        // Consecutive User-agent lines form a single group whose rules apply
        // to every listed agent, so track whether the previous directive was
        // also a User-agent line; only a User-agent line that follows rules
        // starts a new group.
        bool previousWasAgent = false;

        var lines = content.Split(new[] { "\n", "\r\n" }, StringSplitOptions.RemoveEmptyEntries);

        foreach (var line in lines)
        {
            var cleanLine = line.Trim();
            if (string.IsNullOrWhiteSpace(cleanLine) || cleanLine.StartsWith('#')) continue;

            var parts = cleanLine.Split(':', 2);
            if (parts.Length < 2) continue;

            var key = parts[0].Trim();
            var value = parts[1].Trim();

            if (key.Equals("user-agent", StringComparison.OrdinalIgnoreCase))
            {
                if (!previousWasAgent)
                {
                    // New group: membership is re-derived from its agent lines.
                    inRelevantSection = false;
                }

                // Ordinal comparison avoids the culture-sensitive surprises
                // (e.g. Turkish dotless-I) that ToLower()-based matching had.
                if (value.Equals(userAgent, StringComparison.OrdinalIgnoreCase))
                {
                    inRelevantSection = true;
                    foundSpecificMatch = true;

                    // A group naming us specifically overrides anything
                    // accumulated from an earlier wildcard group.
                    isAllowed = true;
                    crawlDelay = null;
                }
                else if (value == "*" && !foundSpecificMatch)
                {
                    inRelevantSection = true;
                }

                previousWasAgent = true;
                continue;
            }

            previousWasAgent = false;

            if (!inRelevantSection) continue;

            if (key.Equals("disallow", StringComparison.OrdinalIgnoreCase))
            {
                // An empty Disallow matches nothing; it must not cancel an
                // earlier matching Disallow rule, so treat it as a no-op.
                if (!string.IsNullOrEmpty(value) &&
                    path.StartsWith(value, StringComparison.OrdinalIgnoreCase))
                {
                    isAllowed = false;
                }
            }
            else if (key.Equals("allow", StringComparison.OrdinalIgnoreCase))
            {
                if (path.StartsWith(value, StringComparison.OrdinalIgnoreCase)) isAllowed = true;
            }
            else if (key.Equals("crawl-delay", StringComparison.OrdinalIgnoreCase))
            {
                // Some sites publish fractional delays ("0.5"); parse as an
                // invariant-culture double and round up to whole seconds,
                // since RobotsEntry stores an integer delay. Negative values
                // are ignored as malformed.
                if (double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out double delay)
                    && delay >= 0)
                {
                    crawlDelay = (int)Math.Ceiling(delay);
                }
            }
        }

        // NOTE(review): path prefix matching here is case-insensitive; RFC 9309
        // specifies case-sensitive paths. Kept as-is to preserve the existing
        // (more conservative) behavior — confirm before tightening.
        return new RobotsEntry(isAllowed, crawlDelay, null, DateTime.UtcNow.AddDays(1));
    }
}