
Commit 2c5bbdd

Refactored BasePageProcessor.

1 parent e17955d commit 2c5bbdd


48 files changed: +452 -786 lines

src/DotnetSpider.Core.Test/TargetRequestExtractorTest.cs (+5 -4)

@@ -1,8 +1,8 @@
 using System.Linq;
 using System.Net.Http;
-using DotnetSpider.Core.Processor.TargetRequestExtractors;
 using Xunit;
 using DotnetSpider.Downloader;
+using DotnetSpider.Core.Processor.RequestExtractor;

 namespace DotnetSpider.Core.Test
 {
@@ -14,12 +14,13 @@ public void RegionAndPatternTargetUrlsExtractor()
 HttpClient client = new HttpClient();
 var html = client.GetStringAsync("http://www.cnblogs.com").Result;

-var extracotr = new RegionAndPatternTargetRequestExtractor(".//div[@class='pager']", "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$");
+var extracotr = new XPathRequestExtractor(".//div[@class='pager']");
+//, "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$"
 var page = new Page(new Request("http://cnblogs.com"));
 page.Content = html;
 page.ContentType = ContentType.Html;
-var requets = Enumerable.ToList(extracotr.ExtractRequests(page));
-Assert.Equal(11, requets.Count);
+var requets = Enumerable.ToList(extracotr.Extract(page));
+Assert.Equal(12, requets.Count);
 Assert.Contains(requets, r => r.Url == "http://cnblogs.com/sitehome/p/2");
 }
 }

src/DotnetSpider.Core/DotnetSpider.Core.csproj (+2 -2)

@@ -3,7 +3,7 @@
 <TargetFrameworks>net451;netstandard2.0</TargetFrameworks>
 <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
 <PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
-<Version>3.0.4</Version>
+<Version>3.0.5</Version>
 <Authors>[email protected];</Authors>
 <AssemblyName>DotnetSpider.Core</AssemblyName>
 <Copyright>Copyright 2018 Lewis Zou</Copyright>
@@ -52,4 +52,4 @@
 <PackageReference Include="Microsoft.Extensions.Logging" Version="2.1.1" />
 <PackageReference Include="Serilog.Extensions.Logging" Version="2.0.2" />
 </ItemGroup>
-</Project>
+</Project>

src/DotnetSpider.Core/Env.cs (+1 -1)

@@ -156,7 +156,7 @@ public static class Env
 /// <summary>
 /// Configures whether the PageProcessor applies regex filtering to links at depth 1
 /// </summary>
-public static bool ProcessorFilterDefaultRequest = true;
+public static bool FilterDefaultRequest = true;

 /// <summary>
 /// Maximum length of a task's unique identifier
src/DotnetSpider.Core/Page.cs (+13 -5)

@@ -16,11 +16,6 @@ public class Page : Response
 /// </summary>
 public bool Retry { get; set; }

-/// <summary>
-/// Skip target-link extraction for this page
-/// </summary>
-public bool SkipExtractedTargetRequests { get; set; }
-
 /// <summary>
 /// Target links extracted from this page are not added to the scheduling queue
 /// </summary>
@@ -191,6 +186,19 @@ public void AddTargetRequest(Request request, bool increaseDeep = true)
 }
 }

+public Dictionary<string, dynamic> CopyProperties()
+{
+var properties = new Dictionary<string, dynamic>();
+foreach (var kv in Request.Properties)
+{
+if (kv.Key != Env.UrlPropertyKey && kv.Key != Env.TargetUrlPropertyKey)
+{
+properties.Add(kv.Key, kv.Value);
+}
+}
+return properties;
+}
+
 private bool IsAvailable(Request request)
 {
 if (request.Url == null)
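
The new CopyProperties helper snapshots the request's custom properties while dropping the two bookkeeping keys (Env.UrlPropertyKey, Env.TargetUrlPropertyKey) that Process(Page) reassigns on every page. The consuming side is among the hidden files, so the call pattern below is an assumption, not code from this commit:

    // Hypothetical usage: carry user-defined properties over to a follow-up
    // request; the URL bookkeeping keys are excluded because Process(Page)
    // overwrites them for each page anyway.
    Dictionary<string, dynamic> carried = page.CopyProperties();
    foreach (var kv in carried)
    {
        nextRequest.Properties[kv.Key] = kv.Value; // nextRequest: a hypothetical follow-up Request
    }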
src/DotnetSpider.Core/Processor/BasePageProcessor.cs

@@ -1,4 +1,7 @@
-using Microsoft.Extensions.Logging;
+using DotnetSpider.Downloader;
+using Microsoft.Extensions.Logging;
+using System;
+using System.Collections.Generic;

 namespace DotnetSpider.Core.Processor
 {
@@ -13,9 +16,20 @@ public abstract class BasePageProcessor : IPageProcessor
 public ILogger Logger { get; set; }

 /// <summary>
-/// Parser and extractor for target links
+/// Decides whether the current Request should be processed, and whether extracted target links should be added to the queue.
+/// Results produced by the RequestExtractor must also pass the Filter; a request that fails the Filter never reaches the Processor and is therefore a meaningless Request.
 /// </summary>
-public ITargetRequestExtractor TargetUrlsExtractor { get; set; }
+public IFilter Filter { get; set; }
+
+/// <summary>
+/// Interface that extracts target links
+/// </summary>
+public IRequestExtractor RequestExtractor { get; set; }
+
+/// <summary>
+/// Interface that decides whether this is the last page; if it is, the RequestExtractor does not need to run
+/// </summary>
+public ILastPageChecker LastPageChecker { get; set; }

 /// <summary>
 /// Strip everything after # from a link
@@ -43,54 +57,51 @@ public void Process(Page page)
 properties[Env.UrlPropertyKey] = page.Request.Url;
 properties[Env.TargetUrlPropertyKey] = page.TargetUrl;

-if (TargetUrlsExtractor != null)
+if (!(page.Request.GetProperty(Page.Depth) == 1 && !Env.FilterDefaultRequest))
 {
-bool isTarget = true;
-if ((page.Request.GetProperty(Page.Depth) != 1 || Env.ProcessorFilterDefaultRequest) && TargetUrlsExtractor.TargetUrlPatterns != null && TargetUrlsExtractor.TargetUrlPatterns.Count > 0 && !TargetUrlsExtractor.TargetUrlPatterns.Contains(null))
-{
-foreach (var regex in TargetUrlsExtractor.TargetUrlPatterns)
-{
-isTarget = regex.IsMatch(page.Request.Url);
-if (isTarget)
-{
-break;
-}
-}
-}
-
-if (!isTarget)
+if (Filter != null && !Filter.IsMatch(page.Request))
 {
 return;
 }
 }

 Handle(page);

-// Parsing can be implemented in an IAfterDownloaderHandler, so it may no longer be needed here
-if (!page.SkipExtractedTargetRequests && TargetUrlsExtractor != null)
-{
-ExtractUrls(page);
-}
-}
+if (LastPageChecker != null && LastPageChecker.IsLastPage(page)) return;

-/// <summary>
-/// Extract target links and add them to the Page object for the Spider to enqueue
-/// </summary>
-/// <param name="page">Page data</param>
-protected virtual void ExtractUrls(Page page)
-{
-var links = TargetUrlsExtractor.ExtractRequests(page);
-if (links != null)
+IEnumerable<Request> requests;
+if (RequestExtractor != null && (requests = RequestExtractor.Extract(page)) != null)
 {
-foreach (var link in links)
+foreach (var link in requests)
 {
+if (Filter != null && !Filter.IsMatch(link)) continue;
+
 if (CleanPound)
 {
 link.Url = link.Url.Split('#')[0];
 }
+
 page.AddTargetRequest(link);
 }
 }
 }
+
+public BasePageProcessor SetRequestExtractor(IRequestExtractor requestExtractor)
+{
+RequestExtractor = requestExtractor;
+return this;
+}
+
+public BasePageProcessor SetFilter(IFilter filter)
+{
+Filter = filter;
+return this;
+}
+
+public BasePageProcessor SetLastPageChecker(ILastPageChecker lastPageChecker)
+{
+LastPageChecker = lastPageChecker;
+return this;
+}
 }
-}
+}
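
After the refactor, Process is a pipeline of three optional collaborators: Filter gates the current request, Handle does the page work, LastPageChecker can short-circuit extraction, and RequestExtractor plus Filter produce the follow-up requests. A minimal sketch of wiring a concrete processor through the new fluent setters; the subclass is hypothetical, the Handle(Page) override's signature is assumed from the Process body above, and XPathRequestExtractor/PatternFilter are the implementations used elsewhere in this commit:

    // Hypothetical subclass; Handle(Page) is assumed to be the abstract hook
    // that Process(Page) invokes.
    public class CnblogsListProcessor : BasePageProcessor
    {
        protected override void Handle(Page page)
        {
            // Illustrative only: the downloaded HTML is available on page.Content.
            Console.WriteLine(page.Content?.Length ?? 0);
        }
    }

    // The setters return `this`, so the collaborators chain fluently.
    var processor = new CnblogsListProcessor()
        .SetRequestExtractor(new XPathRequestExtractor(".//div[@class='pager']"))
        .SetFilter(new PatternFilter("/sitehome/p/\\d+"));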

src/DotnetSpider.Core/Processor/DefaultPageProcessor.cs (+2 -30)

@@ -1,4 +1,5 @@
-using DotnetSpider.Core.Processor.TargetRequestExtractors;
+using System.Collections;
+using System.Collections.Generic;

 namespace DotnetSpider.Core.Processor
 {
@@ -7,35 +8,6 @@ namespace DotnetSpider.Core.Processor
 /// </summary>
 public class DefaultPageProcessor : BasePageProcessor
 {
-/// <summary>
-/// Constructor
-/// </summary>
-/// <param name="partterns">Regular expressions that match target links</param>
-/// <param name="excludeParterns">Regular expressions that exclude target links</param>
-public DefaultPageProcessor(string[] partterns = null, string[] excludeParterns = null)
-{
-var targetUrlsExtractor = new RegionAndPatternTargetRequestExtractor();
-if (partterns != null && partterns.Length > 0)
-{
-targetUrlsExtractor.AddTargetUrlExtractor(".", partterns);
-}
-if (excludeParterns != null && excludeParterns.Length > 0)
-{
-targetUrlsExtractor.AddExcludeTargetUrlPatterns(excludeParterns);
-}
-TargetUrlsExtractor = targetUrlsExtractor;
-}
-
-/// <summary>
-/// Add a target-link extraction rule
-/// </summary>
-/// <param name="regionXpath">XPath region containing the target links</param>
-/// <param name="patterns">Regular expressions that match target links</param>
-public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
-{
-(TargetUrlsExtractor as RegionAndPatternTargetRequestExtractor)?.AddTargetUrlExtractor(regionXpath, patterns);
-}
-
 /// <summary>
 /// Parse the page data
 /// </summary>
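
With the constructor gone, pattern configuration moves out of DefaultPageProcessor and into the Filter collaborator. A hedged before/after sketch; whether the new form also needs an explicit RequestExtractor to walk the whole document depends on defaults that live in the hidden files:

    // Before this commit: include/exclude patterns passed to the constructor.
    var before = new DefaultPageProcessor(
        new[] { "/sitehome/p/\\d+" },
        new[] { "^http://www\\.cnblogs\\.com/$" });

    // After: the same patterns expressed as a PatternFilter.
    var after = new DefaultPageProcessor()
        .SetFilter(new PatternFilter(
            new[] { "/sitehome/p/\\d+" },
            new[] { "^http://www\\.cnblogs\\.com/$" }));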
src/DotnetSpider.Core/Processor/Filter/PatternFilter.cs (new file, +58)

@@ -0,0 +1,58 @@
+using System;
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+using DotnetSpider.Downloader;
+
+namespace DotnetSpider.Core.Processor.Filter
+{
+public class PatternFilter : IFilter
+{
+private readonly List<string> _patterns;
+private readonly List<string> _excludePaterns;
+
+/// <summary>
+/// Constructor
+/// </summary>
+/// <param name="patterns">Regular expressions to match</param>
+public PatternFilter(params string[] patterns) : this(patterns, null) { }
+
+/// <summary>
+/// Constructor
+/// </summary>
+/// <param name="patterns">Regular expressions to match</param>
+/// <param name="excludePatters">Regular expressions to exclude</param>
+public PatternFilter(IEnumerable<string> patterns, IEnumerable<string> excludePatters = null)
+{
+_patterns = patterns == null ? new List<string>() : new List<string>(patterns);
+_excludePaterns = excludePatters == null ? new List<string>() : new List<string>(excludePatters);
+}
+
+public bool IsMatch(Request request)
+{
+if (_patterns.Count == 0 && _excludePaterns.Count == 0) return true;
+
+foreach (var pattern in _excludePaterns)
+{
+if (Regex.IsMatch(request.Url, pattern))
+{
+return false;
+}
+}
+
+foreach (var pattern in _patterns)
+{
+if (Regex.IsMatch(request.Url, pattern))
+{
+return true;
+}
+}
+
+return false;
+}
+
+internal bool ContainsPattern(string pattern)
+{
+return _patterns.Contains(pattern);
+}
+}
+}
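
Order matters in IsMatch: exclude patterns are checked first and veto unconditionally, include patterns must then match, and a filter configured with no patterns at all accepts everything. A short illustration, reusing the URLs from the test above:

    var filter = new PatternFilter(
        new[] { "/sitehome/p/\\d+" },               // include patterns
        new[] { "^http://www\\.cnblogs\\.com/$" }); // exclude patterns

    filter.IsMatch(new Request("http://cnblogs.com/sitehome/p/2"));  // true: an include pattern matches
    filter.IsMatch(new Request("http://www.cnblogs.com/"));          // false: excluded before includes run
    filter.IsMatch(new Request("http://cnblogs.com/about"));         // false: no include pattern matches
    new PatternFilter().IsMatch(new Request("http://example.com/")); // true: no patterns configured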
src/DotnetSpider.Core/Processor/IFilter.cs (new file, +14)

@@ -0,0 +1,14 @@
+using DotnetSpider.Downloader;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace DotnetSpider.Core.Processor
+{
+public interface IFilter
+{
+bool IsMatch(Request request);
+}
+}
src/DotnetSpider.Core/Processor/ILastPageChecker.cs (new file, +13)

@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace DotnetSpider.Core.Processor
+{
+public interface ILastPageChecker
+{
+bool IsLastPage(Page page);
+}
+}
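
The commit ships only the interface; concrete checkers, if any, are in the hidden files. A hypothetical implementation to show the contract, assuming the page number is the trailing URL segment (as in http://cnblogs.com/sitehome/p/2):

    using System.Linq;

    // Hypothetical, not part of this commit: stop extracting once the
    // current page number reaches a configured maximum.
    public class MaxPageNumberChecker : ILastPageChecker
    {
        private readonly int _maxPage;

        public MaxPageNumberChecker(int maxPage)
        {
            _maxPage = maxPage;
        }

        public bool IsLastPage(Page page)
        {
            // Assumes URLs end in the page number, e.g. .../sitehome/p/2.
            var lastSegment = page.Request.Url.TrimEnd('/').Split('/').Last();
            return int.TryParse(lastSegment, out var n) && n >= _maxPage;
        }
    }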
src/DotnetSpider.Core/Processor/IRequestExtractor.cs (new file, +14)

@@ -0,0 +1,14 @@
+using DotnetSpider.Downloader;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace DotnetSpider.Core.Processor
+{
+public interface IRequestExtractor
+{
+IEnumerable<Request> Extract(Page page);
+}
+}
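
The XPathRequestExtractor the test relies on is among the hidden files, so here is a hypothetical regex-based implementation of the contract instead, using only members this commit shows (Page.Content and the Request URL constructor):

    using System.Collections.Generic;
    using System.Text.RegularExpressions;
    using DotnetSpider.Downloader;

    // Hypothetical, not part of this commit: yield every href attribute in the
    // raw HTML as a new Request; BasePageProcessor.Process then applies the
    // Filter and CleanPound before enqueuing.
    public class HrefRegexExtractor : IRequestExtractor
    {
        private static readonly Regex Href =
            new Regex("href=\"(?<url>[^\"]+)\"", RegexOptions.IgnoreCase);

        public IEnumerable<Request> Extract(Page page)
        {
            if (string.IsNullOrEmpty(page.Content)) yield break;

            foreach (Match match in Href.Matches(page.Content))
            {
                yield return new Request(match.Groups["url"].Value);
            }
        }
    }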
