using System.Net;
using System.Net.Sockets;
using System.Text.RegularExpressions;
using System.Web;
using Microsoft.Extensions.Caching.Memory;
using MyHomePage.Api.Common;
using MyHomePage.Api.Infrastructure.Configuration;
using Microsoft.Extensions.Options;
namespace MyHomePage.Api.Services;
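/// <summary>
/// Automatically fetches a website's favicon.
/// P31 main path: BookmarkService.Create/Update calls this service when it detects that no icon was specified:
/// 1. HTTP GET the target page (limited to 5s / 1MB, with a browser-like User-Agent)
/// 2. Parse the HTML for &lt;link rel="icon"&gt; / apple-touch-icon / shortcut icon
/// 3. Pick the best icon by priority (apple-touch > largest sizes > /favicon.ico fallback)
/// 4. Download the icon image into the Upload/favicons/ directory
/// 5. Return a URL the frontend can access (stored in bookmark.IconUrl with iconType='favicon')
/// SSRF protection: rejects private-network / local / link-local addresses.
/// On failure returns null (does not throw); the caller falls back to the default icon.
/// </summary>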
public class FaviconService
{
// ---- Injected dependencies -------------------------------------------------

/// <summary>Factory used by NewClient() to obtain the named HttpClient.</summary>
private readonly IHttpClientFactory _httpFactory;
/// <summary>Upload service that persists the downloaded icon stream.</summary>
private readonly IUploadService _upload;
/// <summary>In-memory cache of fetch results, keyed by CacheKeyPrefix plus the page address.</summary>
private readonly IMemoryCache _cache;
/// <summary>Upload configuration; in the code visible here it is only read for diagnostic logging.</summary>
private readonly UploadOptions _uploadOptions;
/// <summary>Logger for fetch diagnostics.</summary>
private readonly ILogger _logger;

// ---- Constants -------------------------------------------------------------

/// <summary>HttpClient name (author's note: matches the AddHttpClient(name) registration in Program.cs).</summary>
private const string HttpClientName = nameof(FaviconService);
/// <summary>User-Agent string imitating a common browser, so that sites which reject unknown clients still answer.</summary>
private const string UserAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";
/// <summary>Upper bound on the size of a downloaded icon (5 MB).</summary>
private const long MaxIconBytes = 5L * 1024 * 1024;
/// <summary>Prefix of every cache key written by this service.</summary>
private const string CacheKeyPrefix = "favicon:";
/// <summary>Lifetime of a successful cache entry: the same page is not re-fetched for 24 hours.</summary>
private static readonly TimeSpan CacheTtl = TimeSpan.FromHours(24);
/// <summary>
/// Creates the service from its injected dependencies.
/// </summary>
/// <param name="httpFactory">Factory used to create the named HttpClient for each request.</param>
/// <param name="upload">Upload service that stores the downloaded icon.</param>
/// <param name="cache">Memory cache holding previously fetched icon URLs.</param>
/// <param name="uploadOptions">Upload options wrapper; its <c>Value</c> is captured once here.</param>
/// <param name="logger">Logger for fetch diagnostics.</param>
public FaviconService(
    IHttpClientFactory httpFactory,
    IUploadService upload,
    IMemoryCache cache,
    // The options wrapper must be the generic IOptions<T>: there is no non-generic IOptions
    // exposing .Value, and the value is stored in an UploadOptions field below.
    IOptions<UploadOptions> uploadOptions,
    // NOTE(review): the default ASP.NET Core container registers ILogger<T>, not the
    // non-generic ILogger — confirm whether this should be ILogger<FaviconService>.
    ILogger logger)
{
    _httpFactory = httpFactory;
    _upload = upload;
    _cache = cache;
    _uploadOptions = uploadOptions.Value;
    _logger = logger;
}
/// <summary>
/// Asks the injected factory for an HttpClient registered under HttpClientName.
/// Called once per outbound request (author's note: the client is short-lived and the
/// factory takes care of pooling).
/// </summary>
private HttpClient NewClient()
{
    return _httpFactory.CreateClient(HttpClientName);
}
/// <summary>
/// Fetches the favicon of <paramref name="pageUrl"/>, stores it through the upload service and
/// returns the resulting URL. Every failure path returns null: invalid input is rejected up
/// front, and exceptions from the fetch pipeline are logged and swallowed so the caller can
/// silently fall back to its default icon.
/// </summary>
/// <param name="pageUrl">Absolute http/https URL of the page whose icon is wanted.</param>
/// <param name="ct">Cancellation token flowed into the fetch and download steps.</param>
/// <returns>The saved icon URL, or null when no icon could be fetched.</returns>
public async Task<string?> FetchAndSaveAsync(string pageUrl, CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(pageUrl)) return null;
    if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out var pageUri)) return null;
    if (pageUri.Scheme != Uri.UriSchemeHttp && pageUri.Scheme != Uri.UriSchemeHttps) return null;

    // Authority = host plus any non-default port. Keying on Host alone made two different
    // services on the same host (e.g. :8080 and :9090) share one cached icon.
    var cacheKey = CacheKeyPrefix + pageUri.Authority + pageUri.AbsolutePath;
    if (_cache.TryGetValue<string?>(cacheKey, out var cached))
    {
        _logger.LogDebug("Favicon cache hit: {Url} → {Icon}", pageUrl, cached ?? "(null)");
        return cached;
    }

    try
    {
        // P51 (temporary): negative caching via CacheNull(cacheKey) is disabled on both
        // early-return paths below so that a repeated request can pick up a fresh result.
        var iconUrl = await FetchIconUrlAsync(pageUri, ct);
        if (string.IsNullOrEmpty(iconUrl)) return null;

        var saved = await DownloadAndSaveAsync(iconUrl, pageUri, ct);
        if (saved == null) return null;

        _cache.Set(cacheKey, saved, CacheTtl);
        _logger.LogInformation("Favicon fetched: {Page} → {Icon}", pageUrl, saved);
        return saved;
    }
    catch (Exception ex)
    {
        // P51: logged at Error (was Warning) together with the effective UploadOptions.Path,
        // to help diagnose /uploads permission or path-override problems inside containers.
        _logger.LogError(ex,
            "Favicon fetch failed: {Url} | UploadOptions.Path='{OptPath}' (env={Env})",
            pageUrl, _uploadOptions.Path, Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT") ?? "(default)");
        return null;
    }
}
/// <summary>
/// Stores a null result under <paramref name="key"/> for 10 minutes (negative cache entry).
/// </summary>
private void CacheNull(string key)
{
    var negativeTtl = TimeSpan.FromMinutes(10);
    _cache.Set(key, (string?)null, negativeTtl);
}
/// <summary>
/// Main flow: fetch the page HTML, parse its icon candidates and pick the best icon URL.
/// </summary>
/// <param name="pageUri">Absolute page URL.</param>
/// <param name="ct">Cancellation token passed to the HTML fetch.</param>
/// <returns>
/// The chosen icon URL; <c>/favicon.ico</c> on the page's origin when the HTML yields no
/// candidate; null when the HTML could not be fetched or was empty.
/// </returns>
private async Task<string?> FetchIconUrlAsync(Uri pageUri, CancellationToken ct)
{
    // 1. GET the page (FetchHtmlAsync caps the body at 1 MB).
    var html = await FetchHtmlAsync(pageUri, ct);
    if (string.IsNullOrEmpty(html)) return null;

    // 2. Collect candidates from the markup.
    var links = ParseIconLinks(html, pageUri);

    // 3. No candidate at all: fall back to the conventional /favicon.ico location.
    if (links.Count == 0)
    {
        return new Uri(pageUri, "/favicon.ico").ToString();
    }

    // Highest Priority wins, Score breaks ties; both values are assigned by ParseIconLinks.
    var best = links
        .OrderByDescending(l => l.Priority)
        .ThenByDescending(l => l.Score)
        .FirstOrDefault();
    return best?.Url;
}
/// <summary>
/// Downloads the page at <paramref name="pageUri"/> and returns its body decoded as text,
/// reading at most 1 MB. Returns null when the host fails the SSRF check (before or after a
/// redirect), the response is not a success status, or the declared Content-Length exceeds
/// the cap. The request timeout is not set here; it comes from the named HttpClient's
/// configuration.
/// </summary>
/// <param name="pageUri">Absolute page URL.</param>
/// <param name="ct">Cancellation token passed to the DNS check, the request and the body reads.</param>
/// <returns>The (possibly truncated) HTML text, or null when the page was skipped.</returns>
private async Task<string?> FetchHtmlAsync(Uri pageUri, CancellationToken ct)
{
    const int MaxHtmlBytes = 1024 * 1024;

    if (await IsPrivateOrLocalhostAsync(pageUri, ct)) return null;

    using var client = NewClient();
    using var req = new HttpRequestMessage(HttpMethod.Get, pageUri);
    req.Headers.Add("User-Agent", UserAgent);
    req.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    req.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");

    using var resp = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);

    // P33: detailed log so the operator can see what kind of response actually came back.
    _logger.LogInformation("Favicon fetch HTML: {Url} → {Status} {ContentType} ({Len} bytes)",
        pageUri, (int)resp.StatusCode, resp.Content.Headers.ContentType?.MediaType ?? "?",
        resp.Content.Headers.ContentLength ?? -1);

    // If the client followed redirects, the SSRF check above only covered the first hop.
    // Re-validate the URI that produced this response before reading any of its body.
    // (The redirected request has already been sent; this stops the content being used.)
    var finalUri = resp.RequestMessage?.RequestUri;
    if (finalUri is not null && finalUri != pageUri && await IsPrivateOrLocalhostAsync(finalUri, ct))
    {
        _logger.LogWarning("Favicon fetch: {Url} redirected to non-public address {Final}, skip", pageUri, finalUri);
        return null;
    }

    if (!resp.IsSuccessStatusCode)
    {
        _logger.LogDebug("Favicon fetch: {Url} returned {Status}, skip", pageUri, resp.StatusCode);
        return null;
    }

    // Reject early when the server declares a body larger than the cap.
    var contentLength = resp.Content.Headers.ContentLength;
    if (contentLength.HasValue && contentLength.Value > MaxHtmlBytes) return null;

    // Read until the buffer is full or the stream ends; anything past the cap is ignored.
    await using var stream = await resp.Content.ReadAsStreamAsync(ct);
    var buffer = new byte[MaxHtmlBytes];
    var total = 0;
    int read;
    while (total < buffer.Length
           && (read = await stream.ReadAsync(buffer.AsMemory(total, buffer.Length - total), ct)) > 0)
    {
        total += read;
    }

    // Decode using the declared charset. Servers may send it quoted (charset="utf-8"), which
    // Encoding.GetEncoding rejects, so strip quotes first. Unknown names fall back to UTF-8.
    var charset = resp.Content.Headers.ContentType?.CharSet?.Trim('"', '\'', ' ');
    System.Text.Encoding encoding;
    try
    {
        encoding = string.IsNullOrEmpty(charset)
            ? System.Text.Encoding.UTF8
            : System.Text.Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        encoding = System.Text.Encoding.UTF8;
    }
    var html = encoding.GetString(buffer, 0, total);

    // P33: diagnostic-only heuristic — does the markup mention any icon link at all?
    // Includes the rel values ParseIconLinks also ranks (shortcut icon, apple-touch-icon),
    // otherwise the warning below fires for pages that do declare an icon.
    string[] iconMarkers =
    {
        "rel=\"icon\"", "rel='icon'", "rel=icon",
        "rel=\"alternate icon\"",
        "rel=\"shortcut icon\"", "rel='shortcut icon'",
        "apple-touch-icon",
    };
    var hasIconTag = iconMarkers.Any(marker => html.Contains(marker, StringComparison.OrdinalIgnoreCase));
    _logger.LogDebug("Favicon HTML scan: {Url} len={Len} contains-icon-link={Has}",
        pageUri, total, hasIconTag);
    if (!hasIconTag)
    {
        // Log the first 200 characters so the operator can see which page answered
        // (for example a reverse-proxy interstitial).
        _logger.LogWarning("Favicon HTML has no icon <link>: {Url} → first 200 chars: {Snippet}",
            pageUri, html.Length > 0 ? html.Substring(0, Math.Min(200, html.Length)) : "(empty)");
    }
    return html;
}
/// <summary>
/// Parses favicon links out of the HTML.
/// P33 improvements:
/// - the regex accepts rel / href in any order (previously rel had to come first, so href-first markup failed)
/// - the priority mapping supports rels that contain the icon keyword, such as `alternate icon` / `fluid-icon`
/// - &lt;meta property="og:image"&gt; is also parsed as a fallback
/// - detailed logging was added to help diagnose why nothing was found
/// </summary>
// NOTE(review): parts of this method look truncated in this copy of the file. The List type
// argument, the full link pattern, the loop header and the declarations of href / relLower /
// sizes / maxSize used further down are not visible here — restore from version control
// before changing any code in this method.
private List ParseIconLinks(string html, Uri baseUri)
{
var results = new List();
// ===== Step 1: parse =====
// Lazily match (.*?) so rel/href may appear in any order; attribute values may use either "..." or '...' quotes
var linkPattern = new Regex(
@"]*?)/?>", // the whole block (including self-closing />)
RegexOptions.IgnoreCase | RegexOptions.Compiled);
// P33 key fix: before matching an attribute name use (? maxSize) maxSize = sz;
}
}
}
}
// Resolve to an absolute URL (relative hrefs are resolved against baseUri); skip anything that is not http/https
if (!Uri.TryCreate(baseUri, href, out var absoluteUri)) continue;
if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps) continue;
// P33 improvement: derive priority from the keywords contained in rel.
// Tiers assigned below: apple-touch 300 > icon 200 > other rel containing "icon" 150 > shortcut icon 100 > anything else 50.
int priority;
int score;
if (relLower.Contains("apple-touch"))
{
priority = 300;
// With no usable size, an apple-touch icon is scored as 180
score = maxSize > 0 ? maxSize : 180;
}
else if (relLower == "shortcut icon")
{
priority = 100;
score = maxSize;
}
else if (relLower == "icon")
{
priority = 200;
score = maxSize;
}
else if (relLower.Contains("icon"))
{
// Fallback: alternate icon / fluid-icon / icon-zzz, etc.
priority = 150;
score = maxSize;
}
else
{
priority = 50;
score = maxSize;
}
_logger.LogDebug("Favicon link candidate: rel={Rel} href={Href} sizes={Sizes} → priority={P} score={S}",
relLower, absoluteUri, sizes ?? "-", priority, score);
results.Add(new IconLink
{
Url = absoluteUri.ToString(),
Priority = priority,
Score = score
});
}
// ===== Step 2: fallback =====
// Many modern sites (blogs / documentation sites in particular) expose og:image; use it as an icon fallback
// NOTE(review): both patterns are constructed (with RegexOptions.Compiled) on every call.
var ogPattern = new Regex(
@"]*?\bproperty\s*=\s*[""']og:image[""'][^>]*?\bcontent\s*=\s*[""']([^""']+)[""']",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
// Also match markup where content comes first
var ogPatternAlt = new Regex(
@"]*?\bcontent\s*=\s*[""']([^""']+)[""'][^>]*?\bproperty\s*=\s*[""']og:image[""']",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
string? ogImage = null;
// Only the first match is used; the content-first pattern is tried only when the property-first one fails
var ogMatch = ogPattern.Match(html);
if (ogMatch.Success) ogImage = ogMatch.Groups[1].Value;
else
{
var ogMatchAlt = ogPatternAlt.Match(html);
if (ogMatchAlt.Success) ogImage = ogMatchAlt.Groups[1].Value;
}
if (!string.IsNullOrEmpty(ogImage) && Uri.TryCreate(baseUri, ogImage, out var ogUri)
&& (ogUri.Scheme == Uri.UriSchemeHttp || ogUri.Scheme == Uri.UriSchemeHttps))
{
_logger.LogDebug("Favicon og:image fallback: {Url}", ogUri);
results.Add(new IconLink
{
Url = ogUri.ToString(),
Priority = 30, // lower than the lowest link tier, so it does not displace a real favicon
Score = 0
});
}
return results;
}
/// <summary>
/// Downloads the icon at <paramref name="iconUrl"/> and stores it through the upload service
/// under the "favicons" sub-directory.
/// </summary>
/// <param name="iconUrl">Absolute http/https URL of the icon.</param>
/// <param name="pageUri">Page the icon belongs to; its scheme and host are sent as Referer.</param>
/// <param name="ct">Cancellation token passed to the DNS check, the request and the body reads.</param>
/// <returns>
/// The URL reported by the upload service, or null when the URL is invalid, the host fails the
/// SSRF check (before or after a redirect), the response is not a success status, the content
/// type is neither image/* nor application/octet-stream, or the body is empty or larger than
/// MaxIconBytes.
/// </returns>
private async Task<string?> DownloadAndSaveAsync(string iconUrl, Uri pageUri, CancellationToken ct)
{
    if (!Uri.TryCreate(iconUrl, UriKind.Absolute, out var iconUri)) return null;
    if (iconUri.Scheme != Uri.UriSchemeHttp && iconUri.Scheme != Uri.UriSchemeHttps) return null;
    if (await IsPrivateOrLocalhostAsync(iconUri, ct)) return null;

    using var client = NewClient();
    using var req = new HttpRequestMessage(HttpMethod.Get, iconUri);
    req.Headers.Add("User-Agent", UserAgent);
    req.Headers.Add("Referer", pageUri.Scheme + "://" + pageUri.Host);

    using var resp = await client.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);
    if (!resp.IsSuccessStatusCode) return null;

    // If the client followed redirects, the SSRF check above only covered the first hop.
    // The body read below is saved and exposed through the upload URL, so re-validate the
    // URI that actually produced this response before touching its content.
    var finalUri = resp.RequestMessage?.RequestUri;
    if (finalUri is not null && finalUri != iconUri && await IsPrivateOrLocalhostAsync(finalUri, ct))
    {
        _logger.LogWarning("Favicon download: {Url} redirected to non-public address {Final}, skip", iconUri, finalUri);
        return null;
    }

    // Content-type gate: images, plus octet-stream for servers that do not label icons.
    // NOTE(review): image/svg+xml passes this gate; confirm how uploaded SVGs are served.
    var contentType = resp.Content.Headers.ContentType?.MediaType ?? "";
    if (!contentType.StartsWith("image/", StringComparison.OrdinalIgnoreCase) &&
        !contentType.Equals("application/octet-stream", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Reject early when the server declares a body larger than the cap.
    var contentLength = resp.Content.Headers.ContentLength;
    if (contentLength.HasValue && contentLength.Value > MaxIconBytes) return null;

    await using var stream = await resp.Content.ReadAsStreamAsync(ct);

    // Buffer the payload in memory. Read at most one byte past the cap: that is enough to
    // tell "exactly MaxIconBytes" (accepted, same as the Content-Length check) from "larger".
    using var ms = new MemoryStream();
    var buffer = new byte[81920];
    long total = 0;
    int read;
    while (total <= MaxIconBytes
           && (read = await stream.ReadAsync(
                   buffer.AsMemory(0, (int)Math.Min(buffer.Length, MaxIconBytes + 1 - total)), ct)) > 0)
    {
        ms.Write(buffer, 0, read);
        total += read;
    }
    if (total == 0 || total > MaxIconBytes) return null;
    ms.Position = 0;

    // File name: last path segment of the icon URL, or "favicon" when the path ends in '/'.
    var fileName = Path.GetFileName(iconUri.AbsolutePath);
    if (string.IsNullOrEmpty(fileName)) fileName = "favicon";

    var result = await _upload.SaveStreamAsync(ms, fileName, contentType, subDir: "favicons");
    return result.Url;
}
/// <summary>
/// SSRF guard: resolves the host of <paramref name="uri"/> and reports whether the request
/// must be refused because the host is local or any resolved address is private / loopback /
/// link-local. Fails closed: any error (including a failed or cancelled DNS lookup, or a
/// lookup that yields no address) is reported as unsafe.
/// </summary>
/// <param name="uri">Absolute URI about to be requested.</param>
/// <param name="ct">Cancellation token passed to the DNS lookup.</param>
/// <returns>true when the URI must not be fetched; false when every resolved address is public.</returns>
private async Task<bool> IsPrivateOrLocalhostAsync(Uri uri, CancellationToken ct)
{
    try
    {
        // Covers "localhost" and loopback IP literals without a DNS round-trip. (The earlier
        // check was gated on UriHostNameType.Basic, which a DNS name never has.)
        if (uri.IsLoopback) return true;

        // IdnHost is the DNS-ready form of the host: punycode for international names and
        // no square brackets around IPv6 literals.
        var host = uri.IdnHost;
        if (host.EndsWith(".localhost", StringComparison.OrdinalIgnoreCase)) return true;

        var addresses = await Dns.GetHostAddressesAsync(host, ct);

        // Nothing resolved means nothing was vetted: treat as unsafe.
        if (addresses.Length == 0) return true;

        // One private address is enough to refuse; the client may connect to any of them.
        return Array.Exists(addresses, IsPrivateOrLocalIp);
    }
    catch (Exception)
    {
        // Deliberate fail-closed: resolution failure or any other error counts as unsafe.
        return true;
    }
}
/// <summary>
/// Reports whether <paramref name="ip"/> is an address the favicon fetcher must never connect
/// to: loopback, private, link-local, carrier-grade NAT, unspecified, multicast or reserved.
/// IPv4-mapped IPv6 addresses are unwrapped first so the IPv4 rules also apply to them.
/// </summary>
/// <param name="ip">A resolved address of the target host.</param>
/// <returns>true when the address is not a public unicast address as far as these rules can tell.</returns>
private static bool IsPrivateOrLocalIp(IPAddress ip)
{
    // ::ffff:a.b.c.d reaches the IPv4 host a.b.c.d; without unwrapping, an AAAA record
    // such as ::ffff:10.0.0.1 would slip past every IPv4 range check below.
    if (ip.IsIPv4MappedToIPv6) ip = ip.MapToIPv4();

    if (IPAddress.IsLoopback(ip)) return true;

    var bytes = ip.GetAddressBytes();

    if (ip.AddressFamily == AddressFamily.InterNetwork)
    {
        return bytes[0] switch
        {
            0 => true,                                  // 0.0.0.0/8 ("this network", includes 0.0.0.0)
            10 => true,                                 // 10.0.0.0/8
            100 => (bytes[1] & 0xC0) == 0x40,           // 100.64.0.0/10 (carrier-grade NAT)
            169 => bytes[1] == 254,                     // 169.254.0.0/16 (link-local)
            172 => bytes[1] is >= 16 and <= 31,         // 172.16.0.0/12
            192 => bytes[1] == 168,                     // 192.168.0.0/16
            >= 224 => true,                             // multicast, reserved and broadcast
            _ => false,
        };
    }

    if (ip.AddressFamily == AddressFamily.InterNetworkV6)
    {
        if (ip.Equals(IPAddress.IPv6Any)) return true;                              // ::
        if (ip.IsIPv6LinkLocal || ip.IsIPv6SiteLocal || ip.IsIPv6Multicast) return true;
        if ((bytes[0] & 0xFE) == 0xFC) return true;                                 // fc00::/7 (unique local)
    }

    return false;
}
/// <summary>
/// One icon candidate found in a page, together with the two keys used to rank candidates.
/// </summary>
private class IconLink
{
    /// <summary>Ranking tier; candidates are ordered by this value first, highest first.</summary>
    public int Priority { get; set; }

    /// <summary>Tie-breaker within a tier, highest first.</summary>
    public int Score { get; set; }

    /// <summary>URL of the icon; defaults to the empty string.</summary>
    public string Url { get; set; } = "";
}
}