using System.Net;
using System.Net.Sockets;
using System.Text.RegularExpressions;
using System.Web;
using Microsoft.Extensions.Caching.Memory;
using Microsoft.Extensions.Options;
using MyHomePage.Api.Common;
using MyHomePage.Api.Infrastructure.Configuration;

namespace MyHomePage.Api.Services;

/// <summary>
/// Fetches a web site's favicon and stores it through the upload service.
/// </summary>
/// <remarks>
/// Intended flow (P31): the bookmark create/update path calls this service when no icon was specified.
/// <list type="number">
/// <item>HTTP GET the target page with a browser-like User-Agent, reading at most 1 MB.</item>
/// <item>Parse the HTML for icon link tags (icon / shortcut icon / apple-touch-icon) and og:image.</item>
/// <item>Pick the best candidate (apple-touch, then largest "sizes"); fall back to /favicon.ico when the page declares none.</item>
/// <item>Download the image and save it under the "favicons" upload sub-directory.</item>
/// <item>Return the URL reported by the upload service.</item>
/// </list>
/// SSRF protection: hosts that resolve to loopback, private, link-local or otherwise non-public addresses are refused.
/// Failures yield <c>null</c> instead of an exception so the caller can fall back to a default icon.
/// NOTE(review): request timeouts are not set in this class; presumably they are configured on the named
/// HttpClient registration — confirm.
/// </remarks>
public class FaviconService
{
    /// <summary>How long a successfully fetched favicon URL is cached for a page.</summary>
    private static readonly TimeSpan CacheTtl = TimeSpan.FromHours(24);

    /// <summary>How long a negative (null) result is cached by <see cref="CacheNull"/>.</summary>
    private static readonly TimeSpan NegativeCacheTtl = TimeSpan.FromMinutes(10);

    private const string CacheKeyPrefix = "favicon:";

    /// <summary>Browser-like User-Agent; some sites reject unknown clients.</summary>
    private const string UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";

    /// <summary>Upper bound for a downloaded icon (5 MB).</summary>
    private const long MaxIconBytes = 5L * 1024 * 1024;

    /// <summary>Upper bound for the HTML that is read and scanned (1 MB).</summary>
    private const int MaxHtmlBytes = 1024 * 1024;

    /// <summary>Name of the HttpClient registration requested from the factory.</summary>
    private const string HttpClientName = nameof(FaviconService);

    // The regexes are static: constructing RegexOptions.Compiled instances on every call is expensive.

    /// <summary>Matches a whole link tag (self-closing or not); group 1 is the attribute text.</summary>
    private static readonly Regex LinkTagRegex = new(
        @"<link\b([^>]*?)/?>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // The (?<![\w-]) look-behind keeps "href" from matching inside "data-href" and similar attributes.
    private static readonly Regex RelAttrRegex = new(
        @"(?<![\w-])rel\s*=\s*[""']([^""']*)[""']",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex HrefAttrRegex = new(
        @"(?<![\w-])href\s*=\s*[""']([^""']*)[""']",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private static readonly Regex SizesAttrRegex = new(
        @"(?<![\w-])sizes\s*=\s*[""']([^""']*)[""']",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>og:image meta tag with "property" before "content".</summary>
    private static readonly Regex OgImageRegex = new(
        @"<meta\b[^>]*?\bproperty\s*=\s*[""']og:image[""'][^>]*?\bcontent\s*=\s*[""']([^""']+)[""']",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>og:image meta tag with "content" before "property".</summary>
    private static readonly Regex OgImageAltRegex = new(
        @"<meta\b[^>]*?\bcontent\s*=\s*[""']([^""']+)[""'][^>]*?\bproperty\s*=\s*[""']og:image[""']",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly IHttpClientFactory _httpFactory;
    private readonly IUploadService _upload;
    private readonly IMemoryCache _cache;
    private readonly UploadOptions _uploadOptions;
    private readonly ILogger<FaviconService> _logger;

    public FaviconService(
        IHttpClientFactory httpFactory,
        IUploadService upload,
        IMemoryCache cache,
        IOptions<UploadOptions> uploadOptions,
        ILogger<FaviconService> logger)
    {
        _httpFactory = httpFactory;
        _upload = upload;
        _cache = cache;
        _uploadOptions = uploadOptions.Value;
        _logger = logger;
    }

    /// <summary>Gets a fresh, short-lived HttpClient from the factory for each call.</summary>
    private HttpClient NewClient() => _httpFactory.CreateClient(HttpClientName);

    /// <summary>
    /// Fetches the favicon of <paramref name="pageUrl"/>, saves it through the upload service and returns its URL.
    /// </summary>
    /// <param name="pageUrl">Absolute http/https URL of the page.</param>
    /// <param name="ct">Cancellation token flowed to DNS, HTTP and stream reads.</param>
    /// <returns>The saved icon URL, or <c>null</c> when any step fails (this method does not throw).</returns>
    public async Task<string?> FetchAndSaveAsync(string pageUrl, CancellationToken ct = default)
    {
        if (string.IsNullOrWhiteSpace(pageUrl))
        {
            return null;
        }

        if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out var pageUri))
        {
            return null;
        }

        if (pageUri.Scheme != Uri.UriSchemeHttp && pageUri.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        // Authority (host plus non-default port) rather than Host alone: two services on the same host but
        // different ports must not share one cached icon.
        var cacheKey = CacheKeyPrefix + pageUri.Authority + pageUri.AbsolutePath;
        if (_cache.TryGetValue(cacheKey, out string? cached))
        {
            _logger.LogDebug("Favicon cache hit: {Url} → {Icon}", pageUrl, cached ?? "(null)");
            return cached;
        }

        try
        {
            var iconUrl = await FetchIconUrlAsync(pageUri, ct);
            if (string.IsNullOrEmpty(iconUrl))
            {
                // P51 (temporary): negative caching is disabled so repeated requests can pick up a new result.
                return null;
            }

            var saved = await DownloadAndSaveAsync(iconUrl, pageUri, ct);
            if (saved == null)
            {
                // P51 (temporary): negative caching is disabled here as well.
                return null;
            }

            _cache.Set(cacheKey, saved, CacheTtl);
            _logger.LogInformation("Favicon fetched: {Page} → {Icon}", pageUrl, saved);
            return saved;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Caller-initiated cancellation is not a failure worth an error-level stack trace.
            _logger.LogDebug("Favicon fetch cancelled: {Url}", pageUrl);
            return null;
        }
        catch (Exception ex)
        {
            // P51: logged at Error level together with the effective upload path to make upload
            // permission / path-override problems visible in container logs.
            _logger.LogError(ex, "Favicon fetch failed: {Url} | UploadOptions.Path='{OptPath}' (env={Env})",
                pageUrl,
                _uploadOptions.Path,
                Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") ?? "(default)");
            return null;
        }
    }

    /// <summary>Stores a negative result for a short time. Currently unused (see the P51 notes above).</summary>
    private void CacheNull(string key) => _cache.Set(key, (string?)null, NegativeCacheTtl);

    /// <summary>
    /// Downloads the page HTML, collects icon candidates and returns the URL of the best one.
    /// Returns <c>null</c> when the HTML could not be fetched; falls back to /favicon.ico when the page
    /// declares no candidate at all.
    /// </summary>
    private async Task<string?> FetchIconUrlAsync(Uri pageUri, CancellationToken ct)
    {
        var html = await FetchHtmlAsync(pageUri, ct);
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        var links = ParseIconLinks(html, pageUri);
        if (links.Count == 0)
        {
            // Nothing declared: try the conventional location.
            return new Uri(pageUri, "/favicon.ico").ToString();
        }

        // Highest priority first; among equal priorities the larger declared size wins.
        var best = links
            .OrderByDescending(l => l.Priority)
            .ThenByDescending(l => l.Score)
            .FirstOrDefault();
        return best?.Url;
    }

    /// <summary>
    /// GETs the page and returns at most <see cref="MaxHtmlBytes"/> of its body decoded as text, or
    /// <c>null</c> when the host is blocked, the status is not successful, or the declared length is too large.
    /// </summary>
    private async Task<string?> FetchHtmlAsync(Uri pageUri, CancellationToken ct)
    {
        if (await IsPrivateOrLocalhostAsync(pageUri, ct))
        {
            return null;
        }

        using var client = NewClient();
        using var req = new HttpRequestMessage(HttpMethod.Get, pageUri);
        req.Headers.Add("User-Agent", UserAgent);
        req.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        req.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");

        using var resp = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);

        // P33: detailed log so the operator can see what was actually returned.
        _logger.LogInformation("Favicon fetch HTML: {Url} → {Status} {ContentType} ({Len} bytes)",
            pageUri,
            (int)resp.StatusCode,
            resp.Content.Headers.ContentType?.MediaType ?? "?",
            resp.Content.Headers.ContentLength ?? -1);

        if (!resp.IsSuccessStatusCode)
        {
            _logger.LogDebug("Favicon fetch: {Url} returned {Status}, skip", pageUri, resp.StatusCode);
            return null;
        }

        if (await IsRedirectTargetBlockedAsync(resp, pageUri, ct))
        {
            return null;
        }

        // A null Content-Length compares false, so unknown lengths fall through to the capped read below.
        if (resp.Content.Headers.ContentLength > MaxHtmlBytes)
        {
            return null;
        }

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
        var buffer = new byte[MaxHtmlBytes];
        var total = 0;
        int read;
        while (total < buffer.Length
               && (read = await stream.ReadAsync(buffer.AsMemory(total, buffer.Length - total), ct)) > 0)
        {
            total += read;
        }

        var html = ResolveEncoding(resp.Content.Headers.ContentType?.CharSet).GetString(buffer, 0, total);

        // P33: record whether the usual icon markers are present at all.
        var hasIconTag = html.Contains("rel=\"icon\"", StringComparison.OrdinalIgnoreCase)
                         || html.Contains("rel='icon'", StringComparison.OrdinalIgnoreCase)
                         || html.Contains("rel=\"alternate icon\"", StringComparison.OrdinalIgnoreCase);
        _logger.LogDebug("Favicon HTML scan: {Url} len={Len} contains-icon-link={Has}", pageUri, total, hasIconTag);
        if (!hasIconTag)
        {
            // The first 200 characters usually reveal an interstitial page (e.g. a reverse-proxy login page).
            _logger.LogWarning("Favicon HTML has no <link rel=icon>: {Url} → first 200 chars: {Snippet}",
                pageUri,
                html.Length > 0 ? html.Substring(0, Math.Min(200, html.Length)) : "(empty)");
        }

        return html;
    }

    /// <summary>
    /// Maps a Content-Type charset to an encoding. Surrounding quotes are removed first; a missing,
    /// unknown or unsupported charset falls back to UTF-8.
    /// </summary>
    private static System.Text.Encoding ResolveEncoding(string? charset)
    {
        var name = charset?.Trim().Trim('"', '\'');
        if (string.IsNullOrEmpty(name))
        {
            return System.Text.Encoding.UTF8;
        }

        try
        {
            return System.Text.Encoding.GetEncoding(name);
        }
        catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
        {
            return System.Text.Encoding.UTF8;
        }
    }

    /// <summary>
    /// Extracts icon candidates from the HTML.
    /// </summary>
    /// <remarks>
    /// P33 behaviour:
    /// <list type="bullet">
    /// <item>rel and href may appear in any order inside the link tag; values may use single or double quotes.</item>
    /// <item>Any rel containing "icon" is accepted (icon, shortcut icon, apple-touch-icon, alternate icon, fluid-icon, ...).</item>
    /// <item>An og:image meta tag is added as a low-priority fallback.</item>
    /// <item>Every candidate is logged at debug level.</item>
    /// </list>
    /// NOTE(review): the link-tag loop was reconstructed from the surviving comments and surrounding code
    /// (part of the original text was lost); verify the rel filter and sizes handling against the intended behaviour.
    /// </remarks>
    /// <param name="html">Page markup.</param>
    /// <param name="baseUri">URI against which relative hrefs are resolved.</param>
    private List<IconLink> ParseIconLinks(string html, Uri baseUri)
    {
        var results = new List<IconLink>();

        // ===== Step 1: link tags =====
        foreach (Match tag in LinkTagRegex.Matches(html))
        {
            var attrs = tag.Groups[1].Value;
            var relMatch = RelAttrRegex.Match(attrs);
            var hrefMatch = HrefAttrRegex.Match(attrs);
            if (!relMatch.Success || !hrefMatch.Success)
            {
                continue;
            }

            var relLower = relMatch.Groups[1].Value.Trim().ToLowerInvariant();
            if (!relLower.Contains("icon", StringComparison.Ordinal))
            {
                // stylesheet / preload / canonical ... are not icon candidates.
                continue;
            }

            // Attribute values are HTML-encoded ("&amp;" inside query strings is common).
            var href = WebUtility.HtmlDecode(hrefMatch.Groups[1].Value).Trim();
            if (href.Length == 0)
            {
                continue;
            }

            var sizesMatch = SizesAttrRegex.Match(attrs);
            var sizes = sizesMatch.Success ? sizesMatch.Groups[1].Value.Trim() : null;
            var maxSize = ParseLargestSize(sizes);

            // Resolve to an absolute http/https URL; data: and other schemes are skipped.
            if (!Uri.TryCreate(baseUri, href, out var absoluteUri))
            {
                continue;
            }

            if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            var (priority, score) = Rank(relLower, maxSize);

            _logger.LogDebug("Favicon link candidate: rel={Rel} href={Href} sizes={Sizes} → priority={P} score={S}",
                relLower, absoluteUri, sizes ?? "-", priority, score);

            results.Add(new IconLink
            {
                Url = absoluteUri.ToString(),
                Priority = priority,
                Score = score
            });
        }

        // ===== Step 2: og:image fallback =====
        // Many sites (blogs, documentation) only expose og:image.
        var ogMatch = OgImageRegex.Match(html);
        if (!ogMatch.Success)
        {
            ogMatch = OgImageAltRegex.Match(html);
        }

        var ogImage = ogMatch.Success ? WebUtility.HtmlDecode(ogMatch.Groups[1].Value).Trim() : null;
        if (!string.IsNullOrEmpty(ogImage)
            && Uri.TryCreate(baseUri, ogImage, out var ogUri)
            && (ogUri.Scheme == Uri.UriSchemeHttp || ogUri.Scheme == Uri.UriSchemeHttps))
        {
            _logger.LogDebug("Favicon og:image fallback: {Url}", ogUri);
            results.Add(new IconLink
            {
                Url = ogUri.ToString(),
                Priority = 30, // below every link-tag candidate so it never beats a real favicon
                Score = 0
            });
        }

        return results;
    }

    /// <summary>
    /// Returns the largest width found in a "sizes" attribute such as "16x16 32x32", or 0 when the value is
    /// missing or contains no WxH token (e.g. "any").
    /// </summary>
    private static int ParseLargestSize(string? sizes)
    {
        if (string.IsNullOrWhiteSpace(sizes))
        {
            return 0;
        }

        var largest = 0;
        foreach (var token in sizes.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries))
        {
            var dims = token.Split('x', 'X');
            if (dims.Length == 2
                && int.TryParse(dims[0], System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture, out var width)
                && width > largest)
            {
                largest = width;
            }
        }

        return largest;
    }

    /// <summary>
    /// Maps a lower-cased rel value to (priority, score). apple-touch icons rank highest and default to a
    /// score of 180 when no size is declared; every other kind uses the declared size as its score.
    /// </summary>
    private static (int Priority, int Score) Rank(string relLower, int maxSize)
    {
        if (relLower.Contains("apple-touch", StringComparison.Ordinal))
        {
            return (300, maxSize > 0 ? maxSize : 180);
        }

        return relLower switch
        {
            "icon" => (200, maxSize),
            "shortcut icon" => (100, maxSize),
            _ when relLower.Contains("icon", StringComparison.Ordinal) => (150, maxSize), // alternate icon / fluid-icon / ...
            _ => (50, maxSize),
        };
    }

    /// <summary>
    /// Downloads the icon and hands it to the upload service under the "favicons" sub-directory.
    /// </summary>
    /// <returns>
    /// The URL reported by the upload service, or <c>null</c> when the URL is invalid, the host is blocked,
    /// the response is not an image, or the body is empty or reaches <see cref="MaxIconBytes"/>.
    /// </returns>
    private async Task<string?> DownloadAndSaveAsync(string iconUrl, Uri pageUri, CancellationToken ct)
    {
        if (!Uri.TryCreate(iconUrl, UriKind.Absolute, out var iconUri))
        {
            return null;
        }

        if (iconUri.Scheme != Uri.UriSchemeHttp && iconUri.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        if (await IsPrivateOrLocalhostAsync(iconUri, ct))
        {
            return null;
        }

        using var client = NewClient();
        using var req = new HttpRequestMessage(HttpMethod.Get, iconUri);
        req.Headers.Add("User-Agent", UserAgent);
        // Scheme + host + non-default port, without user info, so the Referer matches the page's real origin.
        req.Headers.Add("Referer", pageUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.UriEscaped));

        using var resp = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);
        if (!resp.IsSuccessStatusCode)
        {
            return null;
        }

        if (await IsRedirectTargetBlockedAsync(resp, iconUri, ct))
        {
            return null;
        }

        // Only images are accepted; application/octet-stream is tolerated because some servers send it for .ico.
        var contentType = resp.Content.Headers.ContentType?.MediaType ?? "";
        if (!contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase)
            && !contentType.Equals("application/octet-stream", StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        if (resp.Content.Headers.ContentLength > MaxIconBytes)
        {
            return null;
        }

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);

        // Buffer in memory so the size cap is enforced even when Content-Length is absent or wrong.
        using var ms = new MemoryStream();
        var buffer = new byte[81920];
        long total = 0;
        while (total < MaxIconBytes)
        {
            var want = (int)Math.Min(buffer.Length, MaxIconBytes - total);
            var read = await stream.ReadAsync(buffer.AsMemory(0, want), ct);
            if (read <= 0)
            {
                break;
            }

            ms.Write(buffer, 0, read);
            total += read;
        }

        // Reaching the cap means the body may have been truncated, so it is rejected as well.
        if (total == 0 || total >= MaxIconBytes)
        {
            return null;
        }

        ms.Position = 0;

        // File name: last path segment of the icon URL, or "favicon" when the path ends with '/'.
        var fileName = Path.GetFileName(iconUri.AbsolutePath);
        if (string.IsNullOrEmpty(fileName))
        {
            fileName = "favicon";
        }

        var result = await _upload.SaveStreamAsync(ms, fileName, contentType, subDir: "favicons");
        return result.Url;
    }

    /// <summary>
    /// Returns true when the response came from a different host than the one requested (automatic redirect)
    /// and that host is blocked by <see cref="IsPrivateOrLocalhostAsync"/>.
    /// </summary>
    /// <remarks>
    /// This is a partial mitigation: the redirected request has already been sent, but its body is never read
    /// or stored. NOTE(review): disabling automatic redirects on the named HttpClient and validating every hop
    /// would close the gap completely.
    /// </remarks>
    private async Task<bool> IsRedirectTargetBlockedAsync(HttpResponseMessage resp, Uri requested, CancellationToken ct)
    {
        var finalUri = resp.RequestMessage?.RequestUri;
        if (finalUri is null || !finalUri.IsAbsoluteUri)
        {
            return false;
        }

        if (string.Equals(finalUri.Host, requested.Host, StringComparison.OrdinalIgnoreCase))
        {
            return false; // same host that was already validated
        }

        return await IsPrivateOrLocalhostAsync(finalUri, ct);
    }

    /// <summary>
    /// SSRF guard: true when the host is localhost, is or resolves to a non-public address, or cannot be
    /// resolved (fail closed).
    /// </summary>
    private async Task<bool> IsPrivateOrLocalhostAsync(Uri uri, CancellationToken ct)
    {
        try
        {
            // DnsSafeHost has no brackets around IPv6 literals.
            var host = uri.DnsSafeHost;
            if (string.IsNullOrEmpty(host))
            {
                return true;
            }

            if (host.Equals("localhost", StringComparison.OrdinalIgnoreCase)
                || host.EndsWith(".localhost", StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }

            if (IPAddress.TryParse(host, out var literal))
            {
                return IsPrivateOrLocalIp(literal);
            }

            var addresses = await Dns.GetHostAddressesAsync(host, ct);
            if (addresses.Length == 0)
            {
                return true;
            }

            foreach (var ip in addresses)
            {
                if (IsPrivateOrLocalIp(ip))
                {
                    return true;
                }
            }

            return false;
        }
        catch (Exception ex)
        {
            // Resolution failure (or any other error) is treated as unsafe.
            _logger.LogDebug(ex, "Favicon host check failed, treating as blocked: {Host}", uri.Host);
            return true;
        }
    }

    /// <summary>
    /// True for addresses that must never be fetched: loopback, RFC 1918 private ranges, link-local,
    /// carrier-grade NAT (100.64.0.0/10), 0.0.0.0/8, multicast/reserved (224.0.0.0 and above), and the IPv6
    /// unspecified, link-local, site-local, unique-local (fc00::/7) and multicast ranges.
    /// IPv4-mapped IPv6 addresses are unwrapped first so ::ffff:10.0.0.1 is judged as 10.0.0.1.
    /// </summary>
    private static bool IsPrivateOrLocalIp(IPAddress ip)
    {
        if (ip.IsIPv4MappedToIPv6)
        {
            ip = ip.MapToIPv4();
        }

        if (IPAddress.IsLoopback(ip))
        {
            return true;
        }

        if (ip.AddressFamily == AddressFamily.InterNetwork)
        {
            var b = ip.GetAddressBytes();
            return b[0] switch
            {
                0 => true,                         // 0.0.0.0/8 ("this network")
                10 => true,                        // 10.0.0.0/8
                100 => (b[1] & 0xC0) == 0x40,      // 100.64.0.0/10 (carrier-grade NAT)
                127 => true,                       // 127.0.0.0/8
                169 => b[1] == 254,                // 169.254.0.0/16 (link-local, cloud metadata)
                172 => b[1] is >= 16 and <= 31,    // 172.16.0.0/12
                192 => b[1] == 168,                // 192.168.0.0/16
                >= 224 => true,                    // multicast, reserved, broadcast
                _ => false,
            };
        }

        if (ip.AddressFamily == AddressFamily.InterNetworkV6)
        {
            if (ip.Equals(IPAddress.IPv6Any) || ip.IsIPv6LinkLocal || ip.IsIPv6SiteLocal || ip.IsIPv6Multicast)
            {
                return true;
            }

            return (ip.GetAddressBytes()[0] & 0xFE) == 0xFC; // fc00::/7 unique-local
        }

        return true; // unknown address family: fail closed
    }

    /// <summary>One icon candidate: absolute URL plus the ranking keys used to choose the best one.</summary>
    private class IconLink
    {
        public string Url { get; set; } = string.Empty;

        /// <summary>Primary sort key, derived from the kind of rel (higher wins).</summary>
        public int Priority { get; set; }

        /// <summary>Secondary sort key, the largest declared size (higher wins).</summary>
        public int Score { get; set; }
    }
}