Skip to content

Commit cfd674f

Browse files
Lewis authored and committed
Fix: the domain could not be fixed up for some <a> tags
1 parent df2f8b4 commit cfd674f

File tree

1 file changed

+46
-34
lines changed

1 file changed

+46
-34
lines changed

src/DotnetSpider.Extraction/Selectable.cs

+46-34
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public class Selectable : AbstractSelectable
1818
/// <param name="removeOutboundLinks">是否去除外链</param>
1919
public Selectable(string html, string url, bool removeOutboundLinks = true)
2020
{
21-
HtmlDocument document = new HtmlDocument { OptionAutoCloseOnEnd = true };
21+
HtmlDocument document = new HtmlDocument {OptionAutoCloseOnEnd = true};
2222
document.LoadHtml(html);
2323

2424
if (!string.IsNullOrWhiteSpace(url))
@@ -32,7 +32,8 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
3232
RemoveOutboundLinks(document, domain);
3333
}
3434
}
35-
Elements = new List<dynamic> { document.DocumentNode.OuterHtml };
35+
36+
Elements = new List<dynamic> {document.DocumentNode.OuterHtml};
3637
}
3738

3839
/// <summary>
@@ -41,7 +42,7 @@ public Selectable(string html, string url, bool removeOutboundLinks = true)
4142
/// <param name="json">Json</param>
4243
public Selectable(string json)
4344
{
44-
Elements = new List<dynamic> { json };
45+
Elements = new List<dynamic> {json};
4546
}
4647

4748
/// <summary>
@@ -86,29 +87,29 @@ public override dynamic Environment(string field)
8687
switch (key)
8788
{
8889
case "now":
89-
{
90-
return DateTime.Now.ToString("yyyy/MM/dd hh:mm:ss");
91-
}
90+
{
91+
return DateTime.Now.ToString("yyyy/MM/dd hh:mm:ss");
92+
}
9293
case "monday":
93-
{
94-
var now = DateTime.Now;
95-
int i = now.DayOfWeek - DayOfWeek.Monday == -1 ? 6 : -1;
96-
TimeSpan ts = new TimeSpan(i, 0, 0, 0);
97-
return now.Subtract(ts).Date.ToString("yyyy/MM/dd hh:mm:ss");
98-
}
94+
{
95+
var now = DateTime.Now;
96+
int i = now.DayOfWeek - DayOfWeek.Monday == -1 ? 6 : -1;
97+
TimeSpan ts = new TimeSpan(i, 0, 0, 0);
98+
return now.Subtract(ts).Date.ToString("yyyy/MM/dd hh:mm:ss");
99+
}
99100
case "today":
100-
{
101-
return DateTime.Now.Date.ToString("yyyy/MM/dd hh:mm:ss");
102-
}
101+
{
102+
return DateTime.Now.Date.ToString("yyyy/MM/dd hh:mm:ss");
103+
}
103104
case "monthly":
104-
{
105-
var now = DateTime.Now;
106-
return now.AddDays(now.Day * -1 + 1).ToString("yyyy/MM/dd hh:mm:ss");
107-
}
105+
{
106+
var now = DateTime.Now;
107+
return now.AddDays(now.Day * -1 + 1).ToString("yyyy/MM/dd hh:mm:ss");
108+
}
108109
default:
109-
{
110-
return Properties.ContainsKey(field) ? Properties[field] : null;
111-
}
110+
{
111+
return Properties.ContainsKey(field) ? Properties[field] : null;
112+
}
112113
}
113114
}
114115

@@ -128,13 +129,15 @@ public override ISelectable Links()
128129
results.Add(link);
129130
}
130131
}
132+
131133
foreach (var link in sourceLinks)
132134
{
133135
if (Uri.TryCreate(link, UriKind.RelativeOrAbsolute, out _))
134136
{
135137
results.Add(link);
136138
}
137139
}
140+
138141
return new Selectable(results.ToList());
139142
}
140143

@@ -166,8 +169,10 @@ public override ISelectable Select(ISelector selector)
166169
results.Add(result);
167170
}
168171
}
172+
169173
return new Selectable(results);
170174
}
175+
171176
throw new ExtractionException($"{nameof(selector)} is null.");
172177
}
173178

@@ -189,6 +194,7 @@ public override ISelectable SelectList(ISelector selector)
189194
results.AddRange(result);
190195
}
191196
}
197+
192198
return new Selectable(results);
193199
}
194200

@@ -204,8 +210,9 @@ public override IEnumerable<ISelectable> Nodes()
204210
List<ISelectable> result = new List<ISelectable>();
205211
foreach (var element in Elements)
206212
{
207-
result.Add(new Selectable(new List<dynamic>() { element }));
213+
result.Add(new Selectable(new List<dynamic>() {element}));
208214
}
215+
209216
return result;
210217
}
211218

@@ -242,26 +249,28 @@ public static string CanonicalizeUrl(string url, string refer)
242249

243250
private void FixAllRelativeHref(HtmlDocument document, string url)
244251
{
245-
var nodes = document.DocumentNode.SelectNodes("//a[not(starts-with(@href,'http') or starts-with(@href,'https'))]");
246-
if (nodes != null)
252+
var hrefNodes = document.DocumentNode.SelectNodes(".//@href");
253+
if (hrefNodes != null)
247254
{
248-
foreach (var node in nodes)
255+
foreach (var node in hrefNodes)
249256
{
250-
if (node.Attributes["href"] != null)
257+
var href = node.Attributes["href"].Value;
258+
if (!string.IsNullOrWhiteSpace(href) && !href.Contains("http") && !href.Contains("https"))
251259
{
252-
node.Attributes["href"].Value = CanonicalizeUrl(node.Attributes["href"].Value, url);
260+
node.Attributes["href"].Value = CanonicalizeUrl(href, url);
253261
}
254262
}
255263
}
256264

257-
var images = document.DocumentNode.SelectNodes(".//img");
258-
if (images != null)
265+
var srcNodes = document.DocumentNode.SelectNodes(".//@src");
266+
if (srcNodes != null)
259267
{
260-
foreach (var image in images)
268+
foreach (var node in srcNodes)
261269
{
262-
if (image.Attributes["src"] != null)
270+
var src = node.Attributes["src"].Value;
271+
if (!string.IsNullOrWhiteSpace(src) && !src.Contains("http") && !src.Contains("https"))
263272
{
264-
image.Attributes["src"].Value = CanonicalizeUrl(image.Attributes["src"].Value, url);
273+
node.Attributes["src"].Value = CanonicalizeUrl(src, url);
265274
}
266275
}
267276
}
@@ -279,17 +288,20 @@ private void RemoveOutboundLinks(HtmlDocument document, params string[] domains)
279288
foreach (var domain in domains)
280289
{
281290
var href = node.Attributes["href"]?.Value;
282-
if (!string.IsNullOrWhiteSpace(href) && System.Text.RegularExpressions.Regex.IsMatch(href, domain))
291+
if (!string.IsNullOrWhiteSpace(href) &&
292+
System.Text.RegularExpressions.Regex.IsMatch(href, domain))
283293
{
284294
isMatch = true;
285295
break;
286296
}
287297
}
298+
288299
if (!isMatch)
289300
{
290301
deleteNodes.Add(node);
291302
}
292303
}
304+
293305
foreach (var node in deleteNodes)
294306
{
295307
node.Remove();

0 commit comments

Comments
 (0)