
Commit 70dd94d

Author: 邹嵩 (committed)

1. Implement a QueueScheduler that does not deduplicate requests
2. Release 2.4.5

1 parent 33d1889 commit 70dd94d

File tree: 20 files changed, +300 −167 lines

nuget/DotnetSpider.Core.nuspec (+6 −6)

@@ -2,7 +2,7 @@
 <package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
   <metadata>
     <id>DotnetSpider2.Core</id>
-    <version>2.4.4</version>
+    <version>2.4.5</version>
     <authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
     <owners>[email protected]</owners>
     <iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>
@@ -13,23 +13,23 @@
     <description>A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight ,efficient and fast high-level web crawling &amp; scraping framework for .NET</description>
     <dependencies>
       <group targetFramework=".NETStandard2.0">
-        <dependency id="Newtonsoft.Json" version="10.0.3"/>
+        <dependency id="Newtonsoft.Json" version="11.0.2"/>
         <dependency id="NLog" version="5.0.0-beta09"/>
-        <dependency id="HtmlAgilityPack" version="1.6.15"/>
+        <dependency id="HtmlAgilityPack" version="1.7.2"/>
         <dependency id="System.Threading.Tasks.Parallel" version="4.3.0"/>
         <dependency id="System.Text.Encoding.CodePages" version="4.4.0"/>
         <dependency id="System.Runtime.InteropServices.RuntimeInformation" version="4.3.0"/>
         <dependency id="System.Diagnostics.Process" version="4.3.0"/>
         <dependency id="System.Configuration.ConfigurationManager" version="4.4.1"/>
-        <dependency id="System.Data.SqlClient" version="4.4.2"/>
+        <dependency id="System.Data.SqlClient" version="4.4.3"/>
         <dependency id="Microsoft.Extensions.DependencyModel" version="2.0.4"/>
         <dependency id="System.Runtime.Loader" version="4.3.0"/>
         <dependency id="System.Net.Ping" version="4.3.0"/>
         <dependency id="Polly" version="5.8.0" />
       </group>
       <group targetFramework=".NETFramework4.5">
-        <dependency id="Newtonsoft.Json" version="10.0.3"/>
-        <dependency id="HtmlAgilityPack" version="1.6.15"/>
+        <dependency id="Newtonsoft.Json" version="11.0.2"/>
+        <dependency id="HtmlAgilityPack" version="1.7.2"/>
         <dependency id="NLog" version="4.4.12"/>
         <dependency id="Polly" version="5.8.0" />
       </group>

nuget/DotnetSpider.Extension.nuspec (+11 −11)

@@ -2,7 +2,7 @@
 <package xmlns="http://schemas.microsoft.com/packaging/2012/06/nuspec.xsd">
   <metadata>
     <id>DotnetSpider2.Extension</id>
-    <version>2.4.3</version>
+    <version>2.4.5</version>
     <authors>[email protected];Walterwhatwater;xiaohuan0204</authors>
     <owners>[email protected]</owners>
     <iconUrl>https://github.com/zlzforever/DotnetSpider/blob/master/images/icon.png?raw=true</iconUrl>
@@ -13,34 +13,34 @@
     <description>A .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight ,efficient and fast high-level web crawling &amp; scraping framework for .NET</description>
     <dependencies>
       <group targetFramework=".NETStandard2.0">
-        <dependency id="DotnetSpider2.Core" version="2.4.3" />
+        <dependency id="DotnetSpider2.Core" version="2.4.5" />
         <dependency id="Dapper" version="1.50.2"/>
-        <dependency id="MailKit" version="2.0.1"/>
+        <dependency id="MailKit" version="2.0.2"/>
         <dependency id="MongoDB.Driver" version="2.5.0"/>
         <dependency id="MySql.Data" version="6.10.6"/>
         <dependency id="StackExchange.Redis" version="1.2.6" />
         <dependency id="SSH.NET" version="2016.1.0" />
         <dependency id="System.Runtime.Extensions" version="4.3.0"/>
         <dependency id="EPPlus.Core" version="1.5.4"/>
-        <dependency id="Selenium.WebDriver" version="3.8.0"/>
-        <dependency id="Npgsql" version="3.2.6"/>
-        <dependency id="CassandraCSharpDriver" version="3.4.0.1"/>
+        <dependency id="Selenium.WebDriver" version="3.11.0"/>
+        <dependency id="Npgsql" version="3.2.7"/>
+        <dependency id="CassandraCSharpDriver" version="3.4.1"/>
         <dependency id="MessagePack" version="1.7.3.4"/>
       </group>
       <group targetFramework=".NETFramework4.5" >
-        <dependency id="DotnetSpider2.Core" version="2.4.3" />
+        <dependency id="DotnetSpider2.Core" version="2.4.5" />
         <dependency id="Dapper" version="1.50.2"/>
-        <dependency id="MailKit" version="2.0.1"/>
+        <dependency id="MailKit" version="2.0.2"/>
         <dependency id="MongoDB.Driver" version="2.5.0"/>
         <dependency id="MySql.Data" version="6.9.11"/>
         <dependency id="StackExchange.Redis" version="1.2.6" />
         <dependency id="FiddlerCore2" version="1.0.0"/>
         <dependency id="SSH.NET" version="2016.1.0" />
         <dependency id="DotRas.for.Win7" version="1.3.0" />
         <dependency id="EPPlus" version="4.1.1"/>
-        <dependency id="Selenium.WebDriver" version="3.8.0"/>
-        <dependency id="Npgsql" version="3.2.6"/>
-        <dependency id="CassandraCSharpDriver" version="3.4.0.1"/>
+        <dependency id="Selenium.WebDriver" version="3.11.0"/>
+        <dependency id="Npgsql" version="3.2.7"/>
+        <dependency id="CassandraCSharpDriver" version="3.4.1"/>
         <dependency id="MessagePack" version="1.7.3.4"/>
       </group>
     </dependencies>

src/DotnetSpider.Core.Test/Processor/ProcessorTest.cs (+1 −1)

@@ -34,7 +34,7 @@ public void ProcesserException()
 				new TestPageProcessor())
 				// save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
 				.AddPipeline(new FilePipeline());
-
+			spider.ClearSchedulerAfterComplete = false;
 			// dowload html by http client
 			spider.Downloader = new HttpClientDownloader();
 
src/DotnetSpider.Core.Test/SpiderTest.cs (+2 −1)

@@ -130,7 +130,7 @@ public void CloseSignal()
 		{
 			Spider spider = Spider.Create(new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" },
 				new TestPageProcessor()).AddPipeline(new TestPipeline());
-
+			spider.ClearSchedulerAfterComplete = false;
 			for (int i = 0; i < 20; ++i)
 			{
 				spider.AddStartUrl($"http://www.baidu.com/_t={i}");
@@ -143,6 +143,7 @@ public void CloseSignal()
 
 			Spider spider2 = Spider.Create(new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" },
 				new TestPageProcessor()).AddPipeline(new TestPipeline());
+			spider2.ClearSchedulerAfterComplete = false;
 			for (int i = 0; i < 25; ++i)
 			{
 				spider2.AddStartUrl($"http://www.baidu.com/_t={i}");
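
Both tests now pin ClearSchedulerAfterComplete to false. A plausible reading (the flag's implementation is not part of this diff) is that a spider normally clears its scheduler once a run completes, which would reset the counters a test wants to assert on afterwards. A small sketch of the pattern, assuming Spider exposes its Scheduler and that the counters declared on BaseScheduler are reachable through it; TestPageProcessor and TestPipeline are the test fixtures used above:

// Sketch only, under the assumptions stated above.
Spider spider = Spider.Create(new Site { CycleRetryTimes = 5, EncodingName = "UTF-8" },
	new TestPageProcessor()).AddPipeline(new TestPipeline());
spider.ClearSchedulerAfterComplete = false; // keep scheduler state alive after Run()
for (int i = 0; i < 20; ++i)
{
	spider.AddStartUrl($"http://www.baidu.com/_t={i}");
}
spider.Run();
// With the default behaviour this counter could already have been cleared by now:
long succeeded = spider.Scheduler.SuccessRequestsCount; // assumed accessor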

src/DotnetSpider.Core/DotnetSpider.Core.projitems (+2 −0)

@@ -118,6 +118,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)Redial\RedialResult.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Request.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)ResultItems.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)Scheduler\BaseScheduler.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\BloomFilterDuplicateRemover.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\HashSetDuplicateRemover.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\Component\IDuplicateRemover.cs" />
@@ -126,6 +127,7 @@
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\IScheduler.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\PriorityScheduler.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Scheduler\QueueDuplicateRemovedScheduler.cs" />
+    <Compile Include="$(MSBuildThisFileDirectory)Scheduler\QueueScheduler.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Selector\HtmlSelector.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Selector\AbstractSelectable.cs" />
     <Compile Include="$(MSBuildThisFileDirectory)Selector\CssSelector.cs" />

src/DotnetSpider.Core/Scheduler/BaseScheduler.cs (new file, +120 −0)

@@ -0,0 +1,120 @@
+using DotnetSpider.Core.Redial;
+using System;
+using System.Collections.Generic;
+
+namespace DotnetSpider.Core.Scheduler
+{
+	public abstract class BaseScheduler : Named, IScheduler, IDisposable
+	{
+		/// <summary>
+		/// The spider instance
+		/// </summary>
+		protected ISpider Spider { get; set; }
+
+		/// <summary>
+		/// Increase the count of successfully crawled links by 1
+		/// </summary>
+		public abstract void IncreaseSuccessCount();
+
+		/// <summary>
+		/// Increase the count of crawl failures by 1
+		/// </summary>
+		public abstract void IncreaseErrorCount();
+
+		/// <summary>
+		/// Import requests in batch
+		/// </summary>
+		/// <param name="requests">Request objects</param>
+		public abstract void Import(IEnumerable<Request> requests);
+
+		/// <summary>
+		/// Whether the scheduler uses the internet
+		/// </summary>
+		protected abstract bool UseInternet { get; set; }
+
+		/// <summary>
+		/// Number of requests left in the queue
+		/// </summary>
+		public abstract long LeftRequestsCount { get; }
+
+		/// <summary>
+		/// Total number of requests
+		/// </summary>
+		public virtual long TotalRequestsCount { get; }
+
+		/// <summary>
+		/// Number of successfully crawled links
+		/// </summary>
+		public abstract long SuccessRequestsCount { get; }
+
+		/// <summary>
+		/// Number of crawl failures; not a link count: a link that fails several times is counted once per failure
+		/// </summary>
+		public abstract long ErrorRequestsCount { get; }
+
+		/// <summary>
+		/// Whether to traverse depth-first
+		/// </summary>
+		public bool DepthFirst { get; set; } = true;
+
+		/// <summary>
+		/// Push a request onto the queue
+		/// </summary>
+		/// <param name="request">Request object</param>
+		public void Push(Request request)
+		{
+			if (UseInternet)
+			{
+				NetworkCenter.Current.Execute("sch-push", () =>
+				{
+					DoPush(request);
+				});
+			}
+			else
+			{
+				DoPush(request);
+			}
+		}
+
+		/// <summary>
+		/// Initialize the queue
+		/// </summary>
+		/// <param name="spider">The spider instance</param>
+		public virtual void Init(ISpider spider)
+		{
+			if (Spider == null)
+			{
+				Spider = spider;
+			}
+			else
+			{
+				throw new SpiderException("Scheduler already init");
+			}
+		}
+
+		/// <summary>
+		/// Take one request to be processed
+		/// </summary>
+		/// <returns>Request object</returns>
+		public abstract Request Poll();
+
+		/// <summary>
+		/// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
+		/// </summary>
+		public abstract void Dispose();
+
+		/// <summary>
+		/// Export the whole queue
+		/// </summary>
+		public virtual void Export()
+		{
+		}
+
+		protected virtual bool ShouldReserved(Request request)
+		{
+			return request.CycleTriedTimes > 0 && request.CycleTriedTimes <= Spider.Site.CycleRetryTimes;
+		}
+
+		protected abstract void DoPush(Request request);
+	}
+}
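
The commit message says this release adds a QueueScheduler that skips request deduplication, and the projitems change above registers Scheduler\QueueScheduler.cs, but that file's body is not part of this excerpt. Below is a minimal sketch, not the committed code, of what such a BaseScheduler subclass could look like: every pushed request is queued without consulting a duplicate remover, so the same URL can be scheduled any number of times. Member names beyond those visible in BaseScheduler above are illustrative assumptions.

using System.Collections.Generic;
using System.Threading;

namespace DotnetSpider.Core.Scheduler
{
	// Hypothetical sketch of a non-deduplicating, in-memory scheduler.
	public class QueueScheduler : BaseScheduler
	{
		private readonly object _lock = new object();
		private readonly List<Request> _queue = new List<Request>();
		private long _successCount;
		private long _errorCount;
		private long _totalCount;

		protected override bool UseInternet { get; set; }

		public override long LeftRequestsCount
		{
			get { lock (_lock) return _queue.Count; }
		}

		public override long TotalRequestsCount => Interlocked.Read(ref _totalCount);

		public override long SuccessRequestsCount => Interlocked.Read(ref _successCount);

		public override long ErrorRequestsCount => Interlocked.Read(ref _errorCount);

		public override void IncreaseSuccessCount() => Interlocked.Increment(ref _successCount);

		public override void IncreaseErrorCount() => Interlocked.Increment(ref _errorCount);

		public override void Import(IEnumerable<Request> requests)
		{
			foreach (var request in requests) Push(request);
		}

		// No duplicate remover here: every request is accepted as-is.
		protected override void DoPush(Request request)
		{
			lock (_lock)
			{
				// Depth-first treats the list as a stack, breadth-first as a FIFO queue.
				if (DepthFirst) _queue.Insert(0, request);
				else _queue.Add(request);
				Interlocked.Increment(ref _totalCount);
			}
		}

		public override Request Poll()
		{
			lock (_lock)
			{
				if (_queue.Count == 0) return null;
				var request = _queue[0];
				_queue.RemoveAt(0);
				return request;
			}
		}

		public override void Dispose()
		{
			lock (_lock) _queue.Clear();
		}
	}
}

By contrast, the existing QueueDuplicateRemovedScheduler (see the projitems list above) presumably filters pushes through an IDuplicateRemover such as HashSetDuplicateRemover; dropping that filter is the point of the new scheduler, for workloads where the same URL must be re-crawled.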
