6
6
[![NuGet](https://img.shields.io/nuget/vpre/DotnetSpider.svg)](https://www.nuget.org/packages/DotnetSpider)
7
7
[![Member project of .NET Core Community](https://img.shields.io/badge/member%20project%20of-NCC-9e20c9.svg)](https://github.com/dotnetcore)
8
8
[![GitHub license](https://img.shields.io/github/license/dotnetcore/DotnetSpider.svg)](https://github.com/dotnetcore/DotnetSpider/blob/master/LICENSE.txt)
9
+ [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fdotnetcore%2FDotnetSpider.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fdotnetcore%2FDotnetSpider?ref=badge_shield)
9
10
10
11
DotnetSpider is a .NET Standard web crawling library. It is a lightweight, efficient, and fast high-level web crawling & scraping framework.
11
12
12
- If you want get latest beta packages, you should add the myget feed:
13
+ If you want to get the latest beta packages, you should add the MyGet feed:
13
14
14
15
```
15
16
<add key="myget.org" value="https://www.myget.org/F/zlzforever/api/v3/index.json" protocolVersion="3" />
@@ -43,21 +44,21 @@ If you want get latest beta packages, you should add the myget feed:
43
44
9 . MongoDB (optional)
44
45
45
46
docker run --name mongo -d -p 27017:27017 --restart always mongo
46
-
47
+
47
48
10 . RabbitMQ
48
49
49
50
docker run -d --restart always --name rabbimq -p 4369:4369 -p 5671-5672:5671-5672 -p 25672:25672 -p 15671-15672:15671-15672 \
50
51
-e RABBITMQ_DEFAULT_USER=user -e RABBITMQ_DEFAULT_PASS=password \
51
52
rabbitmq:3-management
52
-
53
+
53
54
11 . Docker remote api for mac
54
55
55
56
docker run -d --restart always --name socat -v /var/run/docker.sock:/var/run/docker.sock -p 2376:2375 bobrik/socat TCP4-LISTEN:2375,fork,reuseaddr UNIX-CONNECT:/var/run/docker.sock
56
57
57
58
12 . HBase
58
59
59
- docker run -d --restart always --name hbase -p 20550:8080 -p 8085:8085 -p 9090:9090 -p 9095:9095 -p 16010:16010 dajobe/hbase
60
-
60
+ docker run -d --restart always --name hbase -p 20550:8080 -p 8085:8085 -p 9090:9090 -p 9095:9095 -p 16010:16010 dajobe/hbase
61
+
61
62
### MORE DOCUMENTS
62
63
63
64
https://github.com/dotnetcore/DotnetSpider/wiki
@@ -83,12 +84,12 @@ https://github.com/dotnetcore/DotnetSpider/wiki
83
84
builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
84
85
await builder.Build().RunAsync();
85
86
}
86
-
87
+
87
88
public EntitySpider(IOptions<SpiderOptions> options, SpiderServices services, ILogger<Spider> logger) : base(
88
89
options, services, logger)
89
90
{
90
91
}
91
-
92
+
92
93
protected override async Task InitializeAsync(CancellationToken stoppingToken)
93
94
{
94
95
AddDataFlow(new DataParser<CnblogsEntry>());
@@ -97,12 +98,12 @@ https://github.com/dotnetcore/DotnetSpider/wiki
97
98
new Request("https://news.cnblogs.com/n/page/1/", new Dictionary<string, string> {{"网站", "博客园"}}),
98
99
new Request("https://news.cnblogs.com/n/page/2/", new Dictionary<string, string> {{"网站", "博客园"}}));
99
100
}
100
-
101
+
101
102
protected override (string Id, string Name) GetIdAndName()
102
103
{
103
104
return (Guid.NewGuid().ToString(), "博客园");
104
105
}
105
-
106
+
106
107
[Schema("cnblogs", "news")]
107
108
[EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)]
108
109
[GlobalValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)]
@@ -114,37 +115,37 @@ https://github.com/dotnetcore/DotnetSpider/wiki
114
115
HasIndex(x => x.Title);
115
116
HasIndex(x => new {x.WebSite, x.Guid}, true);
116
117
}
117
-
118
+
118
119
public int Id { get; set; }
119
-
120
+
120
121
[Required]
121
122
[StringLength(200)]
122
123
[ValueSelector(Expression = "类别", Type = SelectorType.Environment)]
123
124
public string Category { get; set; }
124
-
125
+
125
126
[Required]
126
127
[StringLength(200)]
127
128
[ValueSelector(Expression = "网站", Type = SelectorType.Environment)]
128
129
public string WebSite { get; set; }
129
-
130
+
130
131
[StringLength(200)]
131
132
[ValueSelector(Expression = "//title")]
132
133
[ReplaceFormatter(NewValue = "", OldValue = " - 博客园")]
133
134
public string Title { get; set; }
134
-
135
+
135
136
[StringLength(40)]
136
137
[ValueSelector(Expression = "GUID", Type = SelectorType.Environment)]
137
138
public string Guid { get; set; }
138
-
139
+
139
140
[ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
140
141
public string News { get; set; }
141
-
142
+
142
143
[ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
143
144
public string Url { get; set; }
144
-
145
+
145
146
[ValueSelector(Expression = ".//div[@class='entry_summary']")]
146
147
public string PlainText { get; set; }
147
-
148
+
148
149
[ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)]
149
150
public DateTime CreationTime { get; set; }
150
151
}
0 commit comments