1
- using Microsoft . Extensions . Logging ;
1
+ using DotnetSpider . Downloader ;
2
+ using Microsoft . Extensions . Logging ;
3
+ using System ;
4
+ using System . Collections . Generic ;
2
5
3
6
namespace DotnetSpider . Core . Processor
4
7
{
@@ -13,9 +16,20 @@ public abstract class BasePageProcessor : IPageProcessor
13
16
public ILogger Logger { get ; set ; }
14
17
15
18
/// <summary>
16
- /// 目标链接的解析器、抽取器
19
+ /// 用于判断是否需要处理当前 Request, 以及解析出来的目标链接是否需要添加到队列.
20
+ /// RequestExtractor 解析出来的结果也需验证是否符合 Filter, 如果不符合 Filter 那么最终也不会进入到 Processor, 即为无意义的 Request
17
21
/// </summary>
18
- public ITargetRequestExtractor TargetUrlsExtractor { get ; set ; }
22
+ public IFilter Filter { get ; set ; }
23
+
24
+ /// <summary>
25
+ /// 解析目标链接的接口
26
+ /// </summary>
27
+ public IRequestExtractor RequestExtractor { get ; set ; }
28
+
29
+ /// <summary>
30
+ /// 是否最后一页的判断接口, 如果是最后一页, 则不需要执行 RequestExtractor
31
+ /// </summary>
32
+ public ILastPageChecker LastPageChecker { get ; set ; }
19
33
20
34
/// <summary>
21
35
/// 去掉链接#后面的所有内容
@@ -43,54 +57,51 @@ public void Process(Page page)
43
57
properties [ Env . UrlPropertyKey ] = page . Request . Url ;
44
58
properties [ Env . TargetUrlPropertyKey ] = page . TargetUrl ;
45
59
46
- if ( TargetUrlsExtractor != null )
60
+ if ( ! ( page . Request . GetProperty ( Page . Depth ) == 1 && ! Env . FilterDefaultRequest ) )
47
61
{
48
- bool isTarget = true ;
49
- if ( ( page . Request . GetProperty ( Page . Depth ) != 1 || Env . ProcessorFilterDefaultRequest ) && TargetUrlsExtractor . TargetUrlPatterns != null && TargetUrlsExtractor . TargetUrlPatterns . Count > 0 && ! TargetUrlsExtractor . TargetUrlPatterns . Contains ( null ) )
50
- {
51
- foreach ( var regex in TargetUrlsExtractor . TargetUrlPatterns )
52
- {
53
- isTarget = regex . IsMatch ( page . Request . Url ) ;
54
- if ( isTarget )
55
- {
56
- break ;
57
- }
58
- }
59
- }
60
-
61
- if ( ! isTarget )
62
+ if ( Filter != null && ! Filter . IsMatch ( page . Request ) )
62
63
{
63
64
return ;
64
65
}
65
66
}
66
67
67
68
Handle ( page ) ;
68
69
69
- // IAfterDownloaderHandler中可以实现解析, 有可能不再需要解析了
70
- if ( ! page . SkipExtractedTargetRequests && TargetUrlsExtractor != null )
71
- {
72
- ExtractUrls ( page ) ;
73
- }
74
- }
70
+ if ( LastPageChecker != null && LastPageChecker . IsLastPage ( page ) ) return ;
75
71
76
- /// <summary>
77
- /// 解析目标链接并添加到Page对象中, 供Spider对象添加到对列中
78
- /// </summary>
79
- /// <param name="page">页面数据</param>
80
- protected virtual void ExtractUrls ( Page page )
81
- {
82
- var links = TargetUrlsExtractor . ExtractRequests ( page ) ;
83
- if ( links != null )
72
+ IEnumerable < Request > requests ;
73
+ if ( RequestExtractor != null && ( requests = RequestExtractor . Extract ( page ) ) != null )
84
74
{
85
- foreach ( var link in links )
75
+ foreach ( var link in requests )
86
76
{
77
+ if ( Filter != null && ! Filter . IsMatch ( link ) ) continue ;
78
+
87
79
if ( CleanPound )
88
80
{
89
81
link . Url = link . Url . Split ( '#' ) [ 0 ] ;
90
82
}
83
+
91
84
page . AddTargetRequest ( link ) ;
92
85
}
93
86
}
94
87
}
88
+
89
+ public BasePageProcessor SetRequestExtractor ( IRequestExtractor requestExtractor )
90
+ {
91
+ RequestExtractor = requestExtractor ;
92
+ return this ;
93
+ }
94
+
95
+ public BasePageProcessor SetFilter ( IFilter filter )
96
+ {
97
+ Filter = filter ;
98
+ return this ;
99
+ }
100
+
101
+ public BasePageProcessor SetLastPageChecker ( ILastPageChecker lastPageChecker )
102
+ {
103
+ LastPageChecker = lastPageChecker ;
104
+ return this ;
105
+ }
95
106
}
96
- }
107
+ }
0 commit comments