C#批量下载图片与HTML转Word技术实践
HttpClient网络请求基础
C#中的HttpClient类为HTTP通信提供了现代化的异步API。该类支持GET、POST等标准HTTP方法,并内置了异步处理机制。
using System.Net.Http;
using System.Threading.Tasks;
/// <summary>
/// Fetches text content over HTTP through a single shared <see cref="HttpClient"/>.
/// </summary>
public class NetworkClient
{
    // Shared by every instance so the underlying connections are reused
    // instead of creating a new client per request.
    private static readonly HttpClient client = new HttpClient();

    /// <summary>
    /// Sends a GET request to <paramref name="url"/> and returns the response body as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>The response body read as a string.</returns>
    /// <exception cref="NetworkException">
    /// Thrown when the request raises <see cref="HttpRequestException"/>, which includes
    /// the non-success status case reported by <c>EnsureSuccessStatusCode</c>.
    /// </exception>
    public async Task<string> FetchContentAsync(string url)
    {
        try
        {
            // Dispose the response deterministically once the body has been read.
            using (HttpResponseMessage response = await client.GetAsync(url))
            {
                response.EnsureSuccessStatusCode();
                return await response.Content.ReadAsStringAsync();
            }
        }
        catch (HttpRequestException ex)
        {
            // Keep the original exception as InnerException so the cause is not lost.
            throw new NetworkException($"请求失败: {ex.Message}", ex);
        }
    }
}
图片批量下载实现
批量下载的核心在于并发控制和错误处理。通过SemaphoreSlim可以限制同时执行的下载任务数量。
/// <summary>
/// Downloads a batch of images to a local directory, limiting how many downloads
/// run at the same time with a <see cref="SemaphoreSlim"/>.
/// </summary>
public class ImageDownloader : IDisposable
{
    private readonly HttpClient httpClient;
    private readonly SemaphoreSlim semaphore;

    /// <summary>
    /// Creates a downloader.
    /// </summary>
    /// <param name="maxConcurrency">Maximum number of downloads allowed to run concurrently.</param>
    public ImageDownloader(int maxConcurrency = 5)
    {
        httpClient = new HttpClient();
        semaphore = new SemaphoreSlim(maxConcurrency);
    }

    /// <summary>
    /// Downloads every URL in <paramref name="urls"/> into <paramref name="saveDirectory"/>.
    /// The directory is created when it does not exist yet.
    /// </summary>
    /// <param name="urls">Image addresses to download.</param>
    /// <param name="saveDirectory">Directory the files are written to.</param>
    /// <returns>A task that completes when all downloads have finished.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="urls"/> is null.</exception>
    public async Task DownloadImagesAsync(IEnumerable<string> urls, string saveDirectory)
    {
        if (urls == null)
        {
            throw new ArgumentNullException(nameof(urls));
        }

        // An empty directory string means "current directory" for Path.Combine,
        // so only create the directory when a real path was supplied.
        if (!string.IsNullOrEmpty(saveDirectory))
        {
            Directory.CreateDirectory(saveDirectory);
        }

        var tasks = urls.Select(url => DownloadSingleImageAsync(url, saveDirectory));
        await Task.WhenAll(tasks);
    }

    /// <summary>
    /// Releases the HTTP client and the semaphore owned by this instance.
    /// </summary>
    public void Dispose()
    {
        httpClient.Dispose();
        semaphore.Dispose();
    }

    // Downloads one image while holding a semaphore slot; the slot is always released.
    private async Task DownloadSingleImageAsync(string url, string directory)
    {
        await semaphore.WaitAsync();
        try
        {
            byte[] imageData = await httpClient.GetByteArrayAsync(url);
            string fileName = ResolveFileName(url);
            string fullPath = Path.Combine(directory, fileName);
            // Note: two URLs that resolve to the same file name overwrite each other.
            await File.WriteAllBytesAsync(fullPath, imageData);
        }
        finally
        {
            semaphore.Release();
        }
    }

    // Derives a usable local file name from a URL: drops the query string and fragment,
    // decodes percent-escapes, replaces characters that are invalid in file names, and
    // falls back to a generated name when the URL has no file segment (e.g. ends in '/').
    private static string ResolveFileName(string url)
    {
        string path;
        if (Uri.TryCreate(url, UriKind.Absolute, out Uri uri))
        {
            // AbsolutePath excludes the query and fragment parts of the URL.
            path = Uri.UnescapeDataString(uri.AbsolutePath);
        }
        else
        {
            path = (url ?? string.Empty).Split('?', '#')[0];
        }

        string name = Path.GetFileName(path) ?? string.Empty;
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }

        if (string.IsNullOrWhiteSpace(name))
        {
            name = "image_" + Guid.NewGuid().ToString("N");
        }

        return name;
    }
}
网页图片URL提取
使用HtmlAgilityPack解析HTML文档,提取所有图片资源链接:
/// <summary>
/// Collects the <c>src</c> values of the <c>img</c> elements found in an HTML string.
/// </summary>
public class ImageUrlParser
{
    /// <summary>
    /// Parses <paramref name="htmlContent"/> and returns the non-empty <c>src</c>
    /// attribute values of every <c>img</c> element that has one.
    /// </summary>
    /// <param name="htmlContent">HTML markup to scan.</param>
    /// <returns>
    /// The attribute values exactly as they appear in the markup; an empty sequence
    /// when no matching element exists.
    /// </returns>
    public IEnumerable<string> ExtractImageUrls(string htmlContent)
    {
        var document = new HtmlDocument();
        document.LoadHtml(htmlContent);

        var images = document.DocumentNode.SelectNodes("//img[@src]");

        // The node lookup can yield null, so guard before querying it.
        if (images == null)
        {
            return Enumerable.Empty<string>();
        }

        return from image in images
               let source = image.GetAttributeValue("src", "")
               where !string.IsNullOrEmpty(source)
               select source;
    }
}
HTML到Word文档转换
使用Open XML SDK可以直接操作Word文档结构,实现精确的格式控制。
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
/// <summary>
/// Builds a Word document from HTML by turning each top-level HTML node
/// into one plain-text paragraph.
/// </summary>
public class WordDocumentCreator
{
    /// <summary>
    /// Creates a Word document at <paramref name="outputPath"/> containing the text of
    /// <paramref name="htmlContent"/>.
    /// </summary>
    /// <param name="htmlContent">HTML markup to convert.</param>
    /// <param name="outputPath">Path of the document file to create.</param>
    /// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null.</exception>
    public void CreateFromHtml(string htmlContent, string outputPath)
    {
        // Validate before creating the file so a bad argument does not leave
        // an empty, half-built document behind.
        if (htmlContent == null)
        {
            throw new ArgumentNullException(nameof(htmlContent));
        }

        using (var document = WordprocessingDocument.Create(outputPath, WordprocessingDocumentType.Document))
        {
            MainDocumentPart mainPart = document.AddMainDocumentPart();
            mainPart.Document = new Document();
            Body body = mainPart.Document.AppendChild(new Body());

            foreach (var paragraph in ParseHtmlToParagraphs(htmlContent))
            {
                body.Append(paragraph);
            }
        }
    }

    // Yields one paragraph per top-level node that carries visible text.
    // Only direct children of the document root are visited, so nested markup
    // is flattened into the text of its top-level ancestor.
    private IEnumerable<Paragraph> ParseHtmlToParagraphs(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        foreach (var node in doc.DocumentNode.ChildNodes)
        {
            // HTML comments are not document content.
            if (node.NodeType == HtmlNodeType.Comment)
            {
                continue;
            }

            // Whitespace between tags would otherwise become empty paragraphs.
            if (string.IsNullOrWhiteSpace(node.InnerText))
            {
                continue;
            }

            yield return ConvertNodeToParagraph(node);
        }
    }

    // Wraps the node's text in a single run inside a new paragraph.
    private Paragraph ConvertNodeToParagraph(HtmlNode node)
    {
        // InnerText keeps entities such as "&amp;" encoded; decode them for Word.
        var text = HtmlEntity.DeEntitize(node.InnerText);
        var run = new Run(new Text(text));
        return new Paragraph(new Run[] { run });
    }
}
样式映射处理
将HTML标签映射到Word样式需要建立转换规则:
/// <summary>
/// Maps HTML tag names to Word style identifiers.
/// </summary>
public class StyleMapper
{
    // Fallback style for tags without an explicit mapping.
    private const string DefaultStyle = "Normal";

    // HTML tag names are case-insensitive, so the lookup ignores case ("H1" == "h1").
    private static readonly Dictionary<string, string> TagToStyleMap =
        new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
        {
            {"h1", "Heading1"},
            {"h2", "Heading2"},
            {"p", "Normal"},
            {"strong", "Strong"}
        };

    /// <summary>
    /// Returns the Word style mapped to <paramref name="tagName"/>.
    /// </summary>
    /// <param name="tagName">HTML tag name, in any letter case; may be null.</param>
    /// <returns>The mapped style, or "Normal" when the tag is null or has no mapping.</returns>
    public string MapTagToStyle(string tagName)
    {
        // Single lookup via TryGetValue instead of ContainsKey followed by the indexer.
        if (tagName != null && TagToStyleMap.TryGetValue(tagName, out string style))
        {
            return style;
        }

        return DefaultStyle;
    }
}
异常处理与重试机制
网络操作的稳定性需要完善的错误处理和重试策略:
/// <summary>
/// Downloads binary content, retrying failed attempts with exponential backoff.
/// </summary>
public class ResilientDownloader
{
    private readonly HttpClient client;
    private readonly int maxRetries;

    /// <summary>
    /// Creates a downloader.
    /// </summary>
    /// <param name="retries">Total number of attempts made before giving up.</param>
    public ResilientDownloader(int retries = 3)
    {
        client = new HttpClient();
        maxRetries = retries;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a byte array, waiting 1s, 2s, 4s, ...
    /// between failed attempts.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <returns>The downloaded bytes.</returns>
    /// <exception cref="HttpRequestException">The last attempt failed with an HTTP error.</exception>
    /// <exception cref="TaskCanceledException">The last attempt timed out.</exception>
    /// <exception cref="NetworkException">No attempt was made because the retry count is below one.</exception>
    public async Task<byte[]> DownloadWithRetryAsync(string url)
    {
        for (int attempt = 0; attempt < maxRetries; attempt++)
        {
            try
            {
                return await client.GetByteArrayAsync(url);
            }
            // The filter lets the final attempt's exception propagate unchanged,
            // so callers still see the original failure type.
            catch (Exception ex) when (IsTransient(ex) && attempt < maxRetries - 1)
            {
                // Exponential backoff: 2^attempt seconds.
                await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt)));
            }
        }

        // Reached only when the loop body never ran (maxRetries < 1).
        throw new NetworkException($"下载失败: {url}");
    }

    // HTTP failures and client timeouts are worth retrying. No cancellation token is
    // passed to the request, so a TaskCanceledException here comes from the client timeout.
    private static bool IsTransient(Exception ex)
    {
        return ex is HttpRequestException || ex is TaskCanceledException;
    }
}