@@ -18,7 +18,7 @@ public class Selectable : AbstractSelectable
18
18
/// <param name="removeOutboundLinks">Whether to remove outbound (external) links</param>
19
19
public Selectable ( string html , string url , bool removeOutboundLinks = true )
20
20
{
21
- HtmlDocument document = new HtmlDocument { OptionAutoCloseOnEnd = true } ;
21
+ HtmlDocument document = new HtmlDocument { OptionAutoCloseOnEnd = true } ;
22
22
document . LoadHtml ( html ) ;
23
23
24
24
if ( ! string . IsNullOrWhiteSpace ( url ) )
@@ -32,7 +32,8 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
32
32
RemoveOutboundLinks ( document , domain ) ;
33
33
}
34
34
}
35
- Elements = new List < dynamic > { document . DocumentNode . OuterHtml } ;
35
+
36
+ Elements = new List < dynamic > { document . DocumentNode . OuterHtml } ;
36
37
}
37
38
38
39
/// <summary>
@@ -41,7 +42,7 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
41
42
/// <param name="json">JSON text that becomes the single element of this selectable</param>
42
43
public Selectable ( string json )
43
44
{
// Store the raw JSON text as the only entry in Elements.
44
- Elements = new List < dynamic > { json } ;
45
+ Elements = new List < dynamic > { json } ;
45
46
}
46
47
47
48
/// <summary>
@@ -86,29 +87,29 @@ public override dynamic Environment(string field)
86
87
switch ( key )
87
88
{
88
89
case "now" :
89
- {
90
- return DateTime . Now . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
91
- }
90
+ {
91
+ return DateTime . Now . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
92
+ }
92
93
case "monday" :
93
- {
94
- var now = DateTime . Now ;
95
- int i = now . DayOfWeek - DayOfWeek . Monday == - 1 ? 6 : - 1 ;
96
- TimeSpan ts = new TimeSpan ( i , 0 , 0 , 0 ) ;
97
- return now . Subtract ( ts ) . Date . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
98
- }
94
+ {
95
+ var now = DateTime . Now ;
96
+ int i = now . DayOfWeek - DayOfWeek . Monday == - 1 ? 6 : - 1 ;
97
+ TimeSpan ts = new TimeSpan ( i , 0 , 0 , 0 ) ;
98
+ return now . Subtract ( ts ) . Date . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
99
+ }
99
100
case "today" :
100
- {
101
- return DateTime . Now . Date . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
102
- }
101
+ {
102
+ return DateTime . Now . Date . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
103
+ }
103
104
case "monthly" :
104
- {
105
- var now = DateTime . Now ;
106
- return now . AddDays ( now . Day * - 1 + 1 ) . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
107
- }
105
+ {
106
+ var now = DateTime . Now ;
107
+ return now . AddDays ( now . Day * - 1 + 1 ) . ToString ( "yyyy/MM/dd hh:mm:ss" ) ;
108
+ }
108
109
default :
109
- {
110
- return Properties . ContainsKey ( field ) ? Properties [ field ] : null ;
111
- }
110
+ {
111
+ return Properties . ContainsKey ( field ) ? Properties [ field ] : null ;
112
+ }
112
113
}
113
114
}
114
115
@@ -128,13 +129,15 @@ public override ISelectable Links()
128
129
results . Add ( link ) ;
129
130
}
130
131
}
132
+
131
133
foreach ( var link in sourceLinks )
132
134
{
133
135
if ( Uri . TryCreate ( link , UriKind . RelativeOrAbsolute , out _ ) )
134
136
{
135
137
results . Add ( link ) ;
136
138
}
137
139
}
140
+
138
141
return new Selectable ( results . ToList ( ) ) ;
139
142
}
140
143
@@ -166,8 +169,10 @@ public override ISelectable Select(ISelector selector)
166
169
results . Add ( result ) ;
167
170
}
168
171
}
172
+
169
173
return new Selectable ( results ) ;
170
174
}
175
+
171
176
throw new ExtractionException ( $ "{ nameof ( selector ) } is null.") ;
172
177
}
173
178
@@ -189,6 +194,7 @@ public override ISelectable SelectList(ISelector selector)
189
194
results . AddRange ( result ) ;
190
195
}
191
196
}
197
+
192
198
return new Selectable ( results ) ;
193
199
}
194
200
@@ -204,8 +210,9 @@ public override IEnumerable<ISelectable> Nodes()
204
210
List < ISelectable > result = new List < ISelectable > ( ) ;
205
211
foreach ( var element in Elements )
206
212
{
207
- result . Add ( new Selectable ( new List < dynamic > ( ) { element } ) ) ;
213
+ result . Add ( new Selectable ( new List < dynamic > ( ) { element } ) ) ;
208
214
}
215
+
209
216
return result ;
210
217
}
211
218
@@ -242,26 +249,28 @@ public static string CanonicalizeUrl(string url, string refer)
242
249
243
250
private void FixAllRelativeHref ( HtmlDocument document , string url )
244
251
{
245
- var nodes = document . DocumentNode . SelectNodes ( "//a[not(starts-with( @href,'http') or starts-with(@href,'https'))] " ) ;
246
- if ( nodes != null )
252
+ var hrefNodes = document . DocumentNode . SelectNodes ( ".// @href" ) ;
253
+ if ( hrefNodes != null )
247
254
{
248
- foreach ( var node in nodes )
255
+ foreach ( var node in hrefNodes )
249
256
{
250
- if ( node . Attributes [ "href" ] != null )
257
+ var href = node . Attributes [ "href" ] . Value ;
258
+ if ( ! string . IsNullOrWhiteSpace ( href ) && ! href . Contains ( "http" ) && ! href . Contains ( "https" ) )
251
259
{
252
- node . Attributes [ "href" ] . Value = CanonicalizeUrl ( node . Attributes [ " href" ] . Value , url ) ;
260
+ node . Attributes [ "href" ] . Value = CanonicalizeUrl ( href , url ) ;
253
261
}
254
262
}
255
263
}
256
264
257
- var images = document . DocumentNode . SelectNodes ( ".//img " ) ;
258
- if ( images != null )
265
+ var srcNodes = document . DocumentNode . SelectNodes ( ".//@src " ) ;
266
+ if ( srcNodes != null )
259
267
{
260
- foreach ( var image in images )
268
+ foreach ( var node in srcNodes )
261
269
{
262
- if ( image . Attributes [ "src" ] != null )
270
+ var src = node . Attributes [ "src" ] . Value ;
271
+ if ( ! string . IsNullOrWhiteSpace ( src ) && ! src . Contains ( "http" ) && ! src . Contains ( "https" ) )
263
272
{
264
- image . Attributes [ "src" ] . Value = CanonicalizeUrl ( image . Attributes [ " src" ] . Value , url ) ;
273
+ node . Attributes [ "src" ] . Value = CanonicalizeUrl ( src , url ) ;
265
274
}
266
275
}
267
276
}
@@ -279,17 +288,20 @@ private void RemoveOutboundLinks(HtmlDocument document, params string[] domains)
279
288
foreach ( var domain in domains )
280
289
{
281
290
var href = node . Attributes [ "href" ] ? . Value ;
282
- if ( ! string . IsNullOrWhiteSpace ( href ) && System . Text . RegularExpressions . Regex . IsMatch ( href , domain ) )
291
+ if ( ! string . IsNullOrWhiteSpace ( href ) &&
292
+ System . Text . RegularExpressions . Regex . IsMatch ( href , domain ) )
283
293
{
284
294
isMatch = true ;
285
295
break ;
286
296
}
287
297
}
298
+
288
299
if ( ! isMatch )
289
300
{
290
301
deleteNodes . Add ( node ) ;
291
302
}
292
303
}
304
+
293
305
foreach ( var node in deleteNodes )
294
306
{
295
307
node . Remove ( ) ;
0 commit comments