Skip to content

Commit b796b0d

Browse files
author
邹嵩
committed
修复300跳转问题
1 parent 1e76470 commit b796b0d

File tree

3 files changed

+118
-20
lines changed

3 files changed

+118
-20
lines changed

src/DotnetSpider.Core.Test/Downloader/HttpClientDownloaderTest.cs

+20
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,26 @@ public void _404Url()
9191
Assert.Equal(5, spider.RetriedTimes.Value);
9292
}
9393

94+
[Fact(DisplayName = "_301Url")]
public void _301Url()
{
    // The test body only runs on Windows; elsewhere it returns without asserting anything.
    if (!Env.IsWindows)
    {
        return;
    }

    // Arrange: a spider with the default HttpClientDownloader and a single start URL.
    // NOTE(review): judging by the test name, this URL is expected to answer with a redirect - confirm.
    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    var crawler = Spider.Create(
        site,
        "abcd",
        new QueueDuplicateRemovedScheduler(),
        new TestPageProcessor());
    crawler.AddPipeline(new ConsolePipeline());
    crawler.SkipTargetUrlsWhenResultIsEmpty = true;
    crawler.Downloader = new HttpClientDownloader();
    crawler.EmptySleepTime = 6000;
    crawler.AddStartUrl("https://tieba.baidu.com/f?kw=%E7%AE%80%E9%98%B3&ie=utf-8&pn=50");

    // Act
    crawler.Run();

    // Assert: the download must have succeeded without a single retry.
    Assert.Equal(0, crawler.RetriedTimes.Value);
}
113+
94114
class HttpClientDownloader2 : HttpClientDownloader
95115
{
96116
protected override Task<Page> DowloadContent(Request request, ISpider spider)

src/DotnetSpider.Core/Downloader/HttpClientDownloader.cs

+8-5
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ public class HttpClientDownloader : BaseDownloader
5252
private readonly bool _decodeHtml;
5353
private readonly double _timeout = 8000;
5454

55+
/// <summary>
/// Redirect switch that is handed to <c>httpClientEntry.Init</c> when a pooled
/// <see cref="HttpClient"/> entry is prepared. Defaults to <c>true</c>.
/// </summary>
public bool AllowAutoRedirect { get; set; } = true;
56+
5557
/// <summary>
5658
/// A <see cref="HttpClient"/> pool
5759
/// </summary>
@@ -106,7 +108,7 @@ protected override void AddCookieToDownloadClient(Cookie cookie)
106108
/// <param name="request">请求信息 <see cref="Request"/></param>
107109
/// <param name="spider">爬虫 <see cref="ISpider"/></param>
108110
/// <returns>页面数据 <see cref="Page"/></returns>
109-
protected override Task<Page> DowloadContent(Request request, ISpider spider)
111+
protected override async Task<Page> DowloadContent(Request request, ISpider spider)
110112
{
111113
HttpResponseMessage response = null;
112114
try
@@ -140,7 +142,7 @@ protected override Task<Page> DowloadContent(Request request, ISpider spider)
140142
if (!spider.Site.DownloadFiles)
141143
{
142144
Logger.Log(spider.Identity, $"Ignore: {request.Url} because media type is not allowed to download.", Level.Warn);
143-
return Task.FromResult(new Page(request) { Skip = true });
145+
return await Task.FromResult(new Page(request) { Skip = true });
144146
}
145147
else
146148
{
@@ -159,11 +161,12 @@ protected override Task<Page> DowloadContent(Request request, ISpider spider)
159161

160162
page.TargetUrl = response.RequestMessage.RequestUri.AbsoluteUri;
161163

162-
return Task.FromResult(page);
164+
return await Task.FromResult(page);
163165
}
164166
catch (Exception e)
165167
{
166-
return Task.FromResult(CreateRetryPage(e, request, spider));
168+
var page = CreateRetryPage(e, request, spider);
169+
return await Task.FromResult(page);
167170
}
168171
finally
169172
{
@@ -180,7 +183,7 @@ protected override Task<Page> DowloadContent(Request request, ISpider spider)
180183

181184
private void PrepareHttpClient(HttpClientEntry httpClientEntry)
182185
{
183-
httpClientEntry.Init(() =>
186+
httpClientEntry.Init(AllowAutoRedirect, () =>
184187
{
185188
if (!Equals(httpClientEntry.Client.Timeout.TotalSeconds, _timeout))
186189
{

src/DotnetSpider.Core/Downloader/IHttpClientPool.cs

+90-15
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
using System.Net;
33
using System.Net.Http;
44
using System.Runtime.CompilerServices;
5+
using System.Threading;
6+
using System.Threading.Tasks;
57

68
namespace DotnetSpider.Core.Downloader
79
{
@@ -14,20 +16,6 @@ public class HttpClientEntry
1416

1517
internal HttpClientHandler Handler { get; private set; }
1618

17-
public HttpClientEntry()
18-
{
19-
Handler = new HttpClientHandler
20-
{
21-
AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip,
22-
UseProxy = true,
23-
UseCookies = true,
24-
AllowAutoRedirect = true,
25-
MaxAutomaticRedirections = 10
26-
};
27-
Client = new HttpClient(Handler);
28-
ActiveTime = DateTime.Now;
29-
}
30-
3119
internal CookieContainer CookieContainer
3220
{
3321
set
@@ -40,17 +28,104 @@ internal CookieContainer CookieContainer
4028
}
4129

4230
[MethodImpl(MethodImplOptions.Synchronized)]
43-
internal void Init(Action configAction, Func<CookieContainer> cookieContainerFactory)
31+
internal void Init(bool allowAutoRedirect, Action configAction, Func<CookieContainer> cookieContainerFactory)
{
    // One-time initialisation of this entry: builds the handler and the HttpClient, runs the
    // caller's configuration callback and installs the cookie container.
    //   allowAutoRedirect      - whether HTTP redirects may be followed at all.
    //   configAction           - callback invoked after Client/Handler exist, so it may configure them.
    //   cookieContainerFactory - supplies the CookieContainer assigned to the handler.
    // Every call after the first successful one is a no-op.
    if (_inited)
    {
        return;
    }

    Handler = new HttpClientHandler
    {
        AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip,
        UseProxy = true,
        UseCookies = true,
        // Honour the caller's choice. This used to be hard-coded to true, which meant that
        // passing allowAutoRedirect == false still let the handler follow redirects silently.
        AllowAutoRedirect = allowAutoRedirect,
        MaxAutomaticRedirections = 10
    };

    // When redirects are allowed, wrap the handler so that redirects the handler itself hands
    // back to the caller are still followed by GlobalRedirectHandler.
    Client = allowAutoRedirect
        ? new HttpClient(new GlobalRedirectHandler(Handler))
        : new HttpClient(Handler);
    ActiveTime = DateTime.Now;

    configAction();

    // Assigned after configAction and before any request is sent through Client.
    Handler.CookieContainer = cookieContainerFactory();

    _inited = true;
}
55+
56+
/// <summary>
/// A <see cref="DelegatingHandler"/> that follows HTTP redirects (301, 302, 303, 307, 308) which the
/// inner handler returned to the caller instead of following them itself.
/// </summary>
public class GlobalRedirectHandler : DelegatingHandler
{
    /// <summary>Upper bound on the number of redirects followed for one logical request.</summary>
    private const int MaxRedirects = 10;

    /// <summary>
    /// Creates the handler on top of <paramref name="innerHandler"/>.
    /// </summary>
    /// <param name="innerHandler">The handler that actually sends the requests.</param>
    public GlobalRedirectHandler(HttpMessageHandler innerHandler)
    {
        InnerHandler = innerHandler;
    }

    /// <summary>
    /// Sends <paramref name="request"/> and, while the answer is a redirect with a usable
    /// <c>Location</c> header, re-issues the request against the new address (at most
    /// <see cref="MaxRedirects"/> times).
    /// </summary>
    /// <param name="request">The request to send.</param>
    /// <param name="cancellationToken">Cancels the whole operation, including followed redirects.</param>
    /// <returns>
    /// The final response. A transport failure is reported as a synthetic
    /// <see cref="HttpStatusCode.ServiceUnavailable"/> response whose reason phrase carries the
    /// error message; cancellation is propagated as <see cref="OperationCanceledException"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    protected override async Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        if (request == null)
        {
            throw new ArgumentNullException(nameof(request));
        }

        // Buffer the body up front: a 301/307/308 has to replay it, and a forward-only
        // stream cannot be read a second time once it has been sent.
        if (request.Content != null)
        {
            await request.Content.LoadIntoBufferAsync().ConfigureAwait(false);
        }

        var response = await SendOrUnavailableAsync(request, cancellationToken).ConfigureAwait(false);

        for (var hop = 0; hop < MaxRedirects && IsRedirect(response.StatusCode); hop++)
        {
            var location = response.Headers.Location;
            if (location == null)
            {
                // Redirect without a target: nothing to follow, hand the response back as is.
                break;
            }

            // The inner handler may already have followed some hops itself, in which case the
            // response's own request message carries the most recent address.
            var previous = response.RequestMessage ?? request;
            if (!location.IsAbsoluteUri)
            {
                if (previous.RequestUri == null || !previous.RequestUri.IsAbsoluteUri)
                {
                    break;
                }

                location = new Uri(previous.RequestUri, location);
            }

            // 302 and 303 are retried as a body-less GET; the other codes keep verb and body.
            var status = (int)response.StatusCode;
            var switchToGet = status == 302 || status == 303;

            var next = await CopyRequestAsync(previous, !switchToGet).ConfigureAwait(false);
            if (switchToGet)
            {
                next.Method = HttpMethod.Get;
            }

            next.RequestUri = location;

            // The intermediate redirect response is no longer needed; release its resources.
            response.Dispose();
            response = await SendOrUnavailableAsync(next, cancellationToken).ConfigureAwait(false);
        }

        return response;
    }

    /// <summary>
    /// Sends one request through the inner handler, turning any failure other than cancellation
    /// into a 503 response so callers always get a response object for transport errors.
    /// </summary>
    private async Task<HttpResponseMessage> SendOrUnavailableAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        try
        {
            return await base.SendAsync(request, cancellationToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Cancellation and timeouts must reach the caller; swallowing them would either
            // hide the timeout or leave the returned task pending forever.
            throw;
        }
        catch (Exception e)
        {
            return new HttpResponseMessage(HttpStatusCode.ServiceUnavailable)
            {
                ReasonPhrase = ToReasonPhrase(e.Message),
                RequestMessage = request
            };
        }
    }

    /// <summary>Returns true for the status codes this handler follows: 301, 302, 303, 307 and 308.</summary>
    private static bool IsRedirect(HttpStatusCode statusCode)
    {
        switch ((int)statusCode)
        {
            case 301:
            case 302:
            case 303:
            case 307:
            case 308:
                return true;
            default:
                return false;
        }
    }

    /// <summary>Strips line breaks, which are not allowed inside an HTTP reason phrase.</summary>
    private static string ToReasonPhrase(string message)
    {
        return (message ?? string.Empty).Replace('\r', ' ').Replace('\n', ' ');
    }

    /// <summary>
    /// Clones method, address, headers and properties of <paramref name="oldRequest"/>; the body
    /// (including its content headers) is cloned only when <paramref name="includeContent"/> is true.
    /// </summary>
    private static async Task<HttpRequestMessage> CopyRequestAsync(HttpRequestMessage oldRequest, bool includeContent)
    {
        var copy = new HttpRequestMessage(oldRequest.Method, oldRequest.RequestUri);

        foreach (var header in oldRequest.Headers)
        {
            copy.Headers.TryAddWithoutValidation(header.Key, header.Value);
        }

        foreach (var property in oldRequest.Properties)
        {
            copy.Properties.Add(property);
        }

        if (includeContent && oldRequest.Content != null)
        {
            // Copy the bytes rather than wrapping the old stream, so the clone can itself be
            // replayed on a further hop, and keep Content-Type and friends.
            var body = await oldRequest.Content.ReadAsByteArrayAsync().ConfigureAwait(false);
            var content = new ByteArrayContent(body);
            foreach (var header in oldRequest.Content.Headers)
            {
                content.Headers.TryAddWithoutValidation(header.Key, header.Value);
            }

            copy.Content = content;
        }

        return copy;
    }
}
54129
}
55130

56131
/// <summary>

0 commit comments

Comments
 (0)