Skip to content

Commit 5568a5a

Browse files
author
邹嵩
committed
修改ReadContent为虚方法, 让用户可以扩展读取的方法(编码问题等)
1 parent b796b0d commit 5568a5a

File tree

3 files changed

+44
-44
lines changed

3 files changed

+44
-44
lines changed

nuget/DotnetSpider.Core.nuspec

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
33
<metadata>
44
<id>DotnetSpider.Core</id>
5-
<version>2.4.9</version>
5+
<version>2.5.0</version>
66
<authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
77
<owners>[email protected]</owners>
88
<iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>

nuget/DotnetSpider.Extension.nuspec

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
<package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
33
<metadata>
44
<id>DotnetSpider.Extension</id>
5-
<version>2.4.9</version>
5+
<version>2.5.0</version>
66
<authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
77
<owners>[email protected]</owners>
88
<iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>

src/DotnetSpider.Core/Downloader/HttpClientDownloader.cs

+42-42
Original file line number | Diff line number | Diff line change
@@ -181,6 +181,48 @@ protected override async Task<Page> DowloadContent(Request request, ISpider spid
181181
}
182182
}
183183

184+
/// <summary>
/// Reads the body of <paramref name="response"/> and decodes it to a string.
/// Declared virtual so that derived downloaders can replace how the body is
/// read and decoded (for example to work around encoding problems).
/// </summary>
/// <param name="site">Site settings; <c>EncodingName</c> / <c>Encoding</c> select an explicit encoding when set.</param>
/// <param name="response">HTTP response whose content is read in full as a byte array.</param>
/// <returns>The response body decoded as text.</returns>
protected virtual string ReadContent(Site site, HttpResponseMessage response)
185+
{
186+
// NOTE(review): .Result blocks the calling thread until the whole body has been read.
byte[] contentBytes = response.Content.ReadAsByteArrayAsync().Result;
187+
// PreventCutOff is defined outside this view; presumably it patches bytes that would
// otherwise truncate the decoded text - TODO confirm against its definition.
contentBytes = PreventCutOff(contentBytes);
188+
if (string.IsNullOrWhiteSpace(site.EncodingName))
189+
{
190+
// No encoding configured on the site: start from the Content-Type charset,
// which is null when the response carries no Content-Type header.
var charSet = response.Content.Headers.ContentType?.CharSet;
191+
// The helper receives both the header charset and the raw bytes; presumably it
// inspects the bytes when the charset is missing or unusable - TODO confirm.
Encoding htmlCharset = EncodingExtensions.GetEncoding(charSet, contentBytes);
192+
return htmlCharset.GetString(contentBytes, 0, contentBytes.Length);
193+
}
194+
else
195+
{
196+
// An encoding is configured on the site: use it and ignore the response headers.
return site.Encoding.GetString(contentBytes, 0, contentBytes.Length);
197+
}
198+
}
199+
200+
/// <summary>
/// Builds a <c>Page</c> for <paramref name="request"/> from the text of
/// <paramref name="response"/>, optionally HTML- and URL-decoding the text first.
/// </summary>
/// <param name="request">The request the page is created for.</param>
/// <param name="response">HTTP response whose body becomes the page content.</param>
/// <param name="site">Site settings, passed to ReadContent and used for the NET45 URL-decoding encoding.</param>
/// <returns>A new page whose <c>Content</c> is the (optionally decoded) response text.</returns>
private Page HandleResponse(Request request, HttpResponseMessage response, Site site)
201+
{
202+
string content = ReadContent(site, response);
203+
204+
// Decoding is opt-in through the _decodeHtml field.
if (_decodeHtml)
205+
{
206+
#if NET45
207+
// HTML-decode first, then URL-decode using the site's encoding, or
// Encoding.Default when the site has no encoding name.
content = HttpUtility.UrlDecode(HttpUtility.HtmlDecode(content), string.IsNullOrEmpty(site.EncodingName) ? Encoding.Default : site.Encoding);
208+
#else
209+
// Same order of operations via WebUtility; no encoding argument is passed here.
content = System.Net.WebUtility.UrlDecode(System.Net.WebUtility.HtmlDecode(content));
210+
#endif
211+
}
212+
213+
Page page = new Page(request)
214+
{
215+
Content = content
216+
};
217+
218+
// Disabled: copying the response headers into the request extras.
//foreach (var header in response.Headers)
219+
//{
220+
// page.Request.PutExtra(header.Key, header.Value);
221+
//}
222+
223+
return page;
224+
}
225+
184226
private void PrepareHttpClient(HttpClientEntry httpClientEntry)
185227
{
186228
httpClientEntry.Init(AllowAutoRedirect, () =>
@@ -278,48 +320,6 @@ private HttpRequestMessage GenerateHttpRequestMessage(Request request, Site site
278320
return httpRequestMessage;
279321
}
280322

281-
/// <summary>
/// Builds a <c>Page</c> for <paramref name="request"/> from the text of
/// <paramref name="response"/>, optionally HTML- and URL-decoding the text first.
/// </summary>
/// <param name="request">The request the page is created for.</param>
/// <param name="response">HTTP response whose body becomes the page content.</param>
/// <param name="site">Site settings, passed to ReadContent and used for the NET45 URL-decoding encoding.</param>
/// <returns>A new page whose <c>Content</c> is the (optionally decoded) response text.</returns>
private Page HandleResponse(Request request, HttpResponseMessage response, Site site)
282-
{
283-
string content = ReadContent(site, response);
284-
285-
// Decoding is opt-in through the _decodeHtml field.
if (_decodeHtml)
286-
{
287-
#if NET45
288-
// HTML-decode first, then URL-decode using the site's encoding, or
// Encoding.Default when the site has no encoding name.
content = HttpUtility.UrlDecode(HttpUtility.HtmlDecode(content), string.IsNullOrEmpty(site.EncodingName) ? Encoding.Default : site.Encoding);
289-
#else
290-
// Same order of operations via WebUtility; no encoding argument is passed here.
content = System.Net.WebUtility.UrlDecode(System.Net.WebUtility.HtmlDecode(content));
291-
#endif
292-
}
293-
294-
Page page = new Page(request)
295-
{
296-
Content = content
297-
};
298-
299-
// Disabled: copying the response headers into the request extras.
//foreach (var header in response.Headers)
300-
//{
301-
// page.Request.PutExtra(header.Key, header.Value);
302-
//}
303-
304-
return page;
305-
}
306-
307-
/// <summary>
/// Reads the body of <paramref name="response"/> and decodes it to a string.
/// </summary>
/// <param name="site">Site settings; <c>EncodingName</c> / <c>Encoding</c> select an explicit encoding when set.</param>
/// <param name="response">HTTP response whose content is read in full as a byte array.</param>
/// <returns>The response body decoded as text.</returns>
private string ReadContent(Site site, HttpResponseMessage response)
308-
{
309-
// NOTE(review): .Result blocks the calling thread until the whole body has been read.
byte[] contentBytes = response.Content.ReadAsByteArrayAsync().Result;
310-
// PreventCutOff is defined outside this view; presumably it patches bytes that would
// otherwise truncate the decoded text - TODO confirm against its definition.
contentBytes = PreventCutOff(contentBytes);
311-
if (string.IsNullOrWhiteSpace(site.EncodingName))
312-
{
313-
// No encoding configured on the site: start from the Content-Type charset,
// which is null when the response carries no Content-Type header.
var charSet = response.Content.Headers.ContentType?.CharSet;
314-
// The helper receives both the header charset and the raw bytes; presumably it
// inspects the bytes when the charset is missing or unusable - TODO confirm.
Encoding htmlCharset = EncodingExtensions.GetEncoding(charSet, contentBytes);
315-
return htmlCharset.GetString(contentBytes, 0, contentBytes.Length);
316-
}
317-
else
318-
{
319-
// An encoding is configured on the site: use it and ignore the response headers.
return site.Encoding.GetString(contentBytes, 0, contentBytes.Length);
320-
}
321-
}
322-
323323
private Page SaveFile(Request request, HttpResponseMessage response, ISpider spider)
324324
{
325325
var intervalPath = new Uri(request.Url).LocalPath.Replace("//", "/").Replace("/", Env.PathSeperator);

0 commit comments

Comments
 (0)