@@ -40,54 +40,26 @@ protected override async Task InitializeAsync(CancellationToken stoppingToken =
40
40
{
41
41
AddDataFlow ( new ListNewsParser ( ) ) ;
42
42
AddDataFlow ( new NewsParser ( ) ) ;
43
- AddDataFlow ( new MyConsoleStorage ( ) ) ;
44
- await AddRequestsAsync ( new Request ( "https://news.cnblogs.com/n/page/1/" ) ) ;
43
+ var request = new Request ( "https://news.cnblogs.com/n/page/1" )
44
+ {
45
+ } ;
46
+ request . Headers . UserAgent = "" ;
47
+ await AddRequestsAsync ( request ) ;
45
48
}
46
49
47
50
/// <summary>
/// Builds the identifier for this spider: a newly created ObjectId rendered as a
/// string, paired with a fixed Chinese display label (the literal below).
/// </summary>
/// <returns>A new SpiderId built from those two values.</returns>
// NOTE(review): presumably the two SpiderId constructor arguments are (id, name) — confirm against SpiderId.
protected override SpiderId GenerateSpiderId ( )
48
51
{
49
52
return new ( ObjectId . CreateId ( ) . ToString ( ) , "博客园" ) ;
50
53
}
51
54
52
// Data-flow step that printed each parsed News item to the console.
// Every line of this class carries a '-' marker, i.e. it is shown as removed by this change.
- protected class MyConsoleStorage : DataFlowBase
53
- {
54
// No setup is needed; completes immediately.
- public override Task InitializeAsync ( )
55
- {
56
- return Task . CompletedTask ;
57
- }
58
-
59
// Looks up the News stored under typeof(News).FullName and writes its URL, title
// and view count to standard output; does nothing if the stored value is not a News.
- public override Task HandleAsync ( DataFlowContext context )
60
- {
61
- if ( IsNullOrEmpty ( context ) )
62
- {
63
// The warning text reads: "the data flow context contains no parse result".
- Logger . LogWarning ( "数据流上下文不包含解析结果" ) ;
64
- return Task . CompletedTask ;
65
- }
66
-
67
- var typeName = typeof ( News ) . FullName ;
68
- var data = context . GetData ( typeName ) ;
69
- if ( data is News news )
70
- {
71
- Console . WriteLine ( $ "URL: { news . Url } , TITLE: { news . Title } , VIEWS: { news . Views } ") ;
72
- }
73
-
74
- return Task . CompletedTask ;
75
- }
76
- }
77
55
78
56
protected class ListNewsParser : DataParser
79
57
{
80
58
/// <summary>
/// Configures the list-page parser: registers a required validator given as a
/// pattern string and a follow-request querier that selects the pager div by XPath,
/// then completes synchronously.
/// </summary>
// NOTE(review): presumably the validator limits this parser to requests whose URL
// matches the pattern, and the querier supplies the links to follow — confirm against DataParser.
public override Task InitializeAsync ( )
81
59
{
82
- // AddRequiredValidator("news\\.cnblogs\\.com/n/page");
83
- AddRequiredValidator ( ( request =>
84
- {
85
- var host = request . RequestUri . Host ;
86
- var regex = host + "/$" ;
87
- return Regex . IsMatch ( request . RequestUri . ToString ( ) , regex ) ;
88
- } ) ) ;
60
// NOTE(review): as rendered here the pattern has a space after each escaped backslash,
// which would require a literal space in the URL; this looks like a display artifact —
// confirm the real pattern has no spaces.
+ AddRequiredValidator ( "news\\ .cnblogs\\ .com/n/page" ) ;
89
61
// follow the pager links if you want to collect every page
90
- // AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));
62
+ AddFollowRequestQuerier ( Selectors . XPath ( ".//div[@class='pager']" ) ) ;
91
63
return Task . CompletedTask ;
92
64
}
93
65
@@ -128,16 +100,22 @@ public override Task InitializeAsync()
128
100
/// <summary>
/// Builds a News item for the current request and stores it in the data-flow
/// context under the key typeof(News).FullName.
/// </summary>
/// <param name="context">Context exposing the current request, its properties and the selectable page.</param>
/// <returns>An already-completed task; all work here is synchronous.</returns>
protected override Task ParseAsync ( DataFlowContext context )
129
101
{
130
102
var typeName = typeof ( News ) . FullName ;
103
+ var url = context . Request . RequestUri . ToString ( ) ;
104
// title, summary and views are read from the request's Properties bag.
// NOTE(review): presumably filled in by the list parser, and whether the indexer
// throws on a missing key depends on the Properties type — confirm.
+ var title = context . Request . Properties [ "title" ] ? . ToString ( ) ? . Trim ( ) ;
105
+ var summary = context . Request . Properties [ "summary" ] ? . ToString ( ) ? . Trim ( ) ;
106
// A null value falls back to "0"; non-numeric text still makes int.Parse throw.
+ var views = int . Parse ( context . Request . Properties [ "views" ] ? . ToString ( ) ? . Trim ( ) ?? "0" ) ;
107
// The article body is taken from the div with id 'news_body'; the null-conditional
// on the Select result leaves content null when Select returns null.
+ var content = context . Selectable . Select ( Selectors . XPath ( ".//div[@id='news_body']" ) ) ? . Value
108
+ ? . Trim ( ) ;
131
109
context . AddData ( typeName ,
132
110
new News
133
111
{
134
- Url = context . Request . RequestUri . ToString ( ) ,
135
- Title = context . Request . Properties [ "title" ] ? . ToString ( ) ? . Trim ( ) ,
136
- Summary = context . Request . Properties [ "summary" ] ? . ToString ( ) ? . Trim ( ) ,
137
- Views = int . Parse ( context . Request . Properties [ "views" ] ? . ToString ( ) ? . Trim ( ) ?? "0" ) ,
138
- Content = context . Selectable . Select ( Selectors . XPath ( ".//div[@id='news_body']" ) ) . Value
139
- ? . Trim ( )
112
+ Url = url ,
113
+ Title = title ,
114
+ Summary = summary ,
115
+ Views = views ,
116
+ Content = content
140
117
} ) ;
118
+
141
119
return Task . CompletedTask ;
142
120
}
143
121
}
0 commit comments