Skip to content

Commit 254aaf5

Browse files
author
邹嵩
committed
修复一个队列重复的问题
开始实现完全分布式框架
1 parent 54b82f5 commit 254aaf5

26 files changed

+246
-18
lines changed

DotnetSpider.sln

+12
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
4040
runtests.sh = runtests.sh
4141
EndProjectSection
4242
EndProject
43+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Node", "src\DotnetSpider.Node\DotnetSpider.Node.csproj", "{C2BAD1A6-6744-4927-B014-67647D3FAD58}"
44+
EndProject
45+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Broker", "src\DotnetSpider.Broker\DotnetSpider.Broker.csproj", "{93099A1A-128B-4023-9271-F535A11F2490}"
46+
EndProject
4347
Global
4448
GlobalSection(SolutionConfigurationPlatforms) = preSolution
4549
Debug|Any CPU = Debug|Any CPU
@@ -98,6 +102,14 @@ Global
98102
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Debug|Any CPU.Build.0 = Debug|Any CPU
99103
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.ActiveCfg = Release|Any CPU
100104
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.Build.0 = Release|Any CPU
105+
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
106+
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.Build.0 = Debug|Any CPU
107+
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.ActiveCfg = Release|Any CPU
108+
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.Build.0 = Release|Any CPU
109+
{93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
110+
{93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.Build.0 = Debug|Any CPU
111+
{93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.ActiveCfg = Release|Any CPU
112+
{93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.Build.0 = Release|Any CPU
101113
EndGlobalSection
102114
GlobalSection(SolutionProperties) = preSolution
103115
HideSolutionNode = FALSE
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="Confluent.Kafka" Version="0.11.5" />
10+
</ItemGroup>
11+
12+
</Project>

src/DotnetSpider.Broker/Program.cs

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
using Confluent.Kafka;
2+
using Confluent.Kafka.Serialization;
3+
using System;
4+
using System.Collections.Generic;
5+
using System.Diagnostics;
6+
using System.Text;
7+
8+
namespace DotnetSpider.Broker
{
    /// <summary>
    /// Console entry point that publishes a fixed batch of test messages to a Kafka topic.
    /// </summary>
    class Program
    {
        /// <summary>Broker address used when none is supplied on the command line.</summary>
        private const string DefaultBootstrapServers = "192.168.90.106:9092";

        /// <summary>Topic every test message is produced to.</summary>
        private const string Topic = "my-topic";

        /// <summary>Number of test messages sent per run.</summary>
        private const int MessageCount = 1000;

        /// <summary>
        /// Produces <see cref="MessageCount"/> messages to <see cref="Topic"/>, printing one line
        /// per delivery report, then waits for a key press before exiting.
        /// </summary>
        /// <param name="args">
        /// Optional. When <c>args[0]</c> is present and non-blank it is used as the
        /// <c>bootstrap.servers</c> value; otherwise <see cref="DefaultBootstrapServers"/> is used.
        /// </param>
        static void Main(string[] args)
        {
            // Let the broker address be overridden without recompiling; the default keeps the
            // previous behaviour when no argument is passed.
            var bootstrapServers = args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0])
                ? args[0]
                : DefaultBootstrapServers;

            var config = new Dictionary<string, object>
            {
                { "bootstrap.servers", bootstrapServers }
            };

            using (var producer = new Producer<Null, string>(config, null, new StringSerializer(Encoding.UTF8)))
            {
                for (int i = 0; i < MessageCount; ++i)
                {
                    // Blocks until the delivery report for this message is available,
                    // so messages are sent one at a time.
                    var dr = producer.ProduceAsync(Topic, null, "test message text").Result;

                    // NOTE(review): with the referenced Confluent.Kafka 0.11.x client a failed
                    // delivery is reported through the report's Error rather than an exception
                    // (confirm against the client docs), so check it before claiming success.
                    if (dr.Error.Code != ErrorCode.NoError)
                    {
                        Console.WriteLine($"Delivery failed: {dr.Error.Reason}");
                        continue;
                    }

                    Console.WriteLine($"Delivered '{dr.Value}' to: {dr.TopicPartitionOffset}");
                }
            }

            // Keep the console window open until the user presses a key.
            Console.Read();
        }
    }
}

src/DotnetSpider.Core/DotnetSpider.Core.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
44
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
55
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
6-
<Version>3.0.1</Version>
6+
<Version>3.0.2</Version>
77
<Authors>[email protected];</Authors>
88
<AssemblyName>DotnetSpider.Core</AssemblyName>
99
<Copyright>Copyright 2018 Lewis Zou</Copyright>

src/DotnetSpider.Core/Pipeline/BasePipeline.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ namespace DotnetSpider.Core.Pipeline
66
/// <summary>
77
/// 数据管道抽象, 通过数据管道把解析的数据存到不同的存储中(文件、数据库)
88
/// </summary>
9-
public abstract class BasePipeline : IPipeline
9+
public abstract class BasePipeline : Named, IPipeline
1010
{
1111
/// <summary>
1212
/// 处理页面解析器解析到的数据结果

src/DotnetSpider.Core/Spider.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ protected void VerifyDataOrGenerateReport(string[] arguments)
144144
public Site Site
145145
{
146146
get => _site;
147-
protected set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); }
147+
set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); }
148148
}
149149

150150
/// <summary>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
using DotnetSpider.Core;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
7+
namespace DotnetSpider.Extension
{
    /// <summary>
    /// A <see cref="Spider"/> that is constructed from a JSON string.
    /// In its current form the class only keeps the JSON text; it does not parse or apply it.
    /// </summary>
    public class ConfigurableSpider : Spider
    {
        // Raw JSON text handed to the constructor; nothing in this class reads it yet.
        private readonly string _json;

        /// <summary>
        /// Creates the spider and stores the supplied JSON text as-is (no validation is performed).
        /// </summary>
        /// <param name="json">JSON text to keep for this spider.</param>
        public ConfigurableSpider(string json) => _json = json;

        /// <summary>
        /// Initialization hook overridden from <see cref="Spider"/>; currently performs no work.
        /// </summary>
        /// <param name="arguments">Run arguments; unused here.</param>
        protected override void OnInit(params string[] arguments)
        {
            // Intentionally left empty.
        }
    }
}

src/DotnetSpider.Extension/DotnetSpider.Extension.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
44
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
55
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
6-
<Version>3.0.1</Version>
6+
<Version>3.0.2</Version>
77
<Authors>[email protected];</Authors>
88
<AssemblyName>DotnetSpider.Extension</AssemblyName>
99
<Copyright>Copyright 2018 Lewis Zou</Copyright>

src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public abstract class DbModelPipeline : ModelPipeline
2222

2323
public int RetryTimes { get; set; } = 600;
2424

25-
public string ConnectString { get; private set; }
25+
public string ConnectString { get; set; }
2626

2727
/// <summary>
2828
/// 数据库忽略大小写
+88
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
{
2+
"Model": {
3+
"Selector": {
4+
"Type": "XPath",
5+
"Expression": "//div[@class='yk-pack pack-film']",
6+
"Arguments": null
7+
},
8+
"Take": 0,
9+
"TakeFromHead": true,
10+
"Table": {
11+
"Database": "youku",
12+
"Name": "show",
13+
"Postfix": "Today",
14+
"UpdateColumns": null,
15+
"Indexs": null,
16+
"Uniques": null,
17+
"FullName": "show_2018_07_25"
18+
},
19+
"Fields": [
20+
{
21+
"NotNull": false,
22+
"Option": "None",
23+
"Length": 255,
24+
"Name": "name",
25+
"IgnoreStore": false,
26+
"DataType": "String",
27+
"IsPrimary": false,
28+
"Formatters": null,
29+
"Type": "XPath",
30+
"Expression": ".//img[@class='quic']/@alt",
31+
"Arguments": null
32+
},
33+
{
34+
"NotNull": false,
35+
"Option": "None",
36+
"Length": 255,
37+
"Name": "index",
38+
"IgnoreStore": false,
39+
"DataType": "Int",
40+
"IsPrimary": false,
41+
"Formatters": null,
42+
"Type": "Enviroment",
43+
"Expression": "index",
44+
"Arguments": null
45+
},
46+
{
47+
"NotNull": false,
48+
"Option": "None",
49+
"Length": 255,
50+
"Name": "id",
51+
"IgnoreStore": false,
52+
"DataType": "Int",
53+
"IsPrimary": true,
54+
"Formatters": null,
55+
"Type": "Enviroment",
56+
"Expression": "",
57+
"Arguments": null
58+
}
59+
],
60+
"TargetRequestSelectors": [
61+
{
62+
"XPaths": [ "//ul[@class='yk-pages']" ],
63+
"Patterns": [ "(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&amp;:/~\\+#]*[\\w\\-\\@?^=%&amp;/~\\+#])?" ]
64+
}
65+
],
66+
"SharedValueSelectors": null
67+
},
68+
"Scheduler": {
69+
"Name": "QueueDuplicateRemovedScheduler"
70+
},
71+
"Downloader": {
72+
"Name": "HttpClientDownloader",
73+
"AllowAutoRedirect": true
74+
},
75+
"Pipeline": {
76+
"Name": "MySqlEntityPipeline",
77+
"ConnectString": "Database='mysql';Data Source=localhost;password=;User ID=root;Port=3306;SslMode=None"
78+
},
79+
"ClearSchedulerAfterCompleted": true,
80+
"StatusFlushInterval": 5000,
81+
"PipelineRetryTimes": 2,
82+
"PipelineCachedSize": 5,
83+
"RedialExecutor": "MutexRedialExecutor",
84+
"EmptySleepTime": 15000,
85+
"ExitWhenComplete": true,
86+
"ThreadNum": 1,
87+
"SkipTargetRequestsWhenResultIsEmpty": true
88+
}

src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
44
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
55
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
6-
<Version>3.0.0</Version>
6+
<Version>3.0.2</Version>
77
<Authors>[email protected];</Authors>
88
<AssemblyName>DotnetSpider.Extraction</AssemblyName>
99
<Copyright>Copyright 2018 Lewis Zou</Copyright>

src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using System;
23

34
namespace DotnetSpider.Extraction.Model.Attribute
45
{

src/DotnetSpider.Extraction/Model/Attribute/SharedValueSelector.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using System;
23

34
namespace DotnetSpider.Extraction.Model.Attribute
45
{

src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using System;
23

34
namespace DotnetSpider.Extraction.Model.Attribute
45
{
@@ -10,6 +11,9 @@ public class TableInfo : System.Attribute
1011
{
1112
private string _name;
1213

14+
[JsonIgnore]
15+
public override object TypeId => base.TypeId;
16+
1317
/// <summary>
1418
/// 数据库名
1519
/// </summary>

src/DotnetSpider.Extraction/Model/Attribute/TargetRequestSelector.cs

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using System;
23

34
namespace DotnetSpider.Extraction.Model.Attribute
45
{
@@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute
89
[AttributeUsage(AttributeTargets.Class, AllowMultiple = true)]
910
public class TargetRequestSelector : System.Attribute
1011
{
12+
[JsonIgnore]
13+
public override object TypeId => base.TypeId;
14+
1115
public TargetRequestSelector() { }
1216

1317
public TargetRequestSelector(string[] xpaths, string[] patterns = null)

src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using System;
23

34
namespace DotnetSpider.Extraction.Model.Attribute
45
{
@@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute
89
[AttributeUsage(AttributeTargets.Property, AllowMultiple = true)]
910
public class ToNext : System.Attribute
1011
{
12+
[JsonIgnore]
13+
public override object TypeId => base.TypeId;
14+
1115
/// <summary>
1216
/// 保存到起始链接的额外信息
1317
/// </summary>

src/DotnetSpider.Extraction/Model/DataType.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Converters;
3+
using System;
24
using System.Collections.Generic;
35
using System.Linq;
46
using System.Text;
57

68
namespace DotnetSpider.Extraction.Model
79
{
10+
[JsonConverter(typeof(StringEnumConverter))]
811
public enum DataType
912
{
1013
None,

src/DotnetSpider.Extraction/Model/FieldOptions.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Converters;
3+
using System;
24
using System.Collections.Generic;
35
using System.Linq;
46
using System.Text;
@@ -8,6 +10,7 @@ namespace DotnetSpider.Extraction.Model
810
/// <summary>
911
/// 额外选项的定义
1012
/// </summary>
13+
[JsonConverter(typeof(StringEnumConverter))]
1114
public enum FieldOptions
1215
{
1316
/// <summary>

src/DotnetSpider.Extraction/Model/ModelDefinition.cs

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using DotnetSpider.Extraction.Model.Attribute;
2+
using Newtonsoft.Json;
23
using System;
34
using System.Collections.Generic;
45
using System.Linq;
@@ -43,6 +44,7 @@ public class ModelDefinition : IModel
4344
/// </summary>
4445
public IEnumerable<SharedValueSelector> SharedValueSelectors { get; protected set; }
4546

47+
[JsonIgnore]
4648
public string Identity { get; protected set; }
4749

4850
public ModelDefinition(Selector selector, IEnumerable<FieldSelector> fields, TableInfo table,

src/DotnetSpider.Extraction/Model/Selector.cs

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
1-
namespace DotnetSpider.Extraction.Model
1+
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Converters;
3+
4+
namespace DotnetSpider.Extraction.Model
25
{
36
/// <summary>
47
/// 选择器特性
58
/// </summary>
69
public class Selector : System.Attribute
710
{
11+
[JsonIgnore]
12+
public override object TypeId => base.TypeId;
13+
814
/// <summary>
915
/// 构造方法
1016
/// </summary>

src/DotnetSpider.Extraction/Model/TableNamePostfix.cs

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1-
namespace DotnetSpider.Extraction.Model
1+
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Converters;
3+
4+
namespace DotnetSpider.Extraction.Model
25
{
6+
[JsonConverter(typeof(StringEnumConverter))]
37
public enum TableNamePostfix
48
{
59
None,

src/DotnetSpider.Extraction/SelectorType.cs

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1-
using System;
1+
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Converters;
3+
using System;
24

35
namespace DotnetSpider.Extraction
46
{
57
/// <summary>
68
/// 查询器类型
79
/// </summary>
810
[Flags]
11+
[JsonConverter(typeof(StringEnumConverter))]
912
public enum SelectorType
1013
{
1114
/// <summary>

0 commit comments

Comments
 (0)