-
Notifications
You must be signed in to change notification settings - Fork 4
/
utils.go
108 lines (87 loc) · 2.37 KB
/
utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package scrape
import (
"fmt"
"io"
"net/url"
"strings"
"golang.org/x/net/html"
)
// resolveURL parses href and resolves it against baseURL, returning the
// resulting absolute URL.
//
// A nil baseURL is treated as an empty base, so only hrefs that are already
// absolute can succeed in that case. An error is returned when href cannot be
// parsed, or when the resolved URL lacks a scheme or a host (for example a
// "mailto:" link, or a relative href with no usable base). On error the
// returned URL is always nil.
func resolveURL(baseURL *url.URL, href string) (*url.URL, error) {
	ref, err := url.Parse(href)
	if err != nil {
		return nil, err
	}

	// ResolveReference dereferences its receiver when the reference is
	// relative, so substitute an empty base instead of panicking on nil.
	if baseURL == nil {
		baseURL = &url.URL{}
	}

	abs := baseURL.ResolveReference(ref)
	if abs.Scheme == "" || abs.Host == "" {
		return nil, fmt.Errorf("url is invalid: %s", abs.String())
	}
	return abs, nil
}
// normalizeHref returns the portion of href that precedes the first
// occurrence of identifier. When identifier does not occur in href, href is
// returned unchanged. Passing "#" strips a URL fragment; an empty identifier
// matches at position 0 and therefore yields the empty string.
func normalizeHref(href string, identifier string) string {
	if cut := strings.Index(href, identifier); cut >= 0 {
		return href[:cut]
	}
	return href
}
// extractURLs collects the URLs held by every attribute of token whose key
// equals key (for example "href").
//
// Each matching value is trimmed of surrounding whitespace, cut at the first
// "#", and resolved against sourceURL. Values that end up empty are reported
// in invalidURLs using the raw attribute value; values that fail to resolve
// are reported using the trimmed, fragment-free form. Both results are nil
// when nothing was appended to them.
func extractURLs(sourceURL *url.URL, token html.Token, key string) (urls []*url.URL, invalidURLs []string) {
	for _, attribute := range token.Attr {
		if attribute.Key != key {
			continue
		}

		cleaned := normalizeHref(strings.TrimSpace(attribute.Val), "#")
		if cleaned == "" {
			// Nothing left after stripping whitespace and the fragment.
			invalidURLs = append(invalidURLs, attribute.Val)
			continue
		}

		resolved, err := resolveURL(sourceURL, cleaned)
		if err != nil {
			invalidURLs = append(invalidURLs, cleaned)
			continue
		}
		urls = append(urls, resolved)
	}
	return urls, invalidURLs
}
// extractURLsFromHTML tokenizes the HTML read from httpBody and gathers the
// href of every <a> start or self-closing tag, resolved against sourceURL.
// Hrefs that cannot be turned into absolute URLs are returned in invalidURLs.
//
// Scanning stops at the first ErrorToken, which the tokenizer reports at end
// of input as well as on read errors; the underlying error is not surfaced.
// The reader is not closed; that remains the caller's responsibility.
func extractURLsFromHTML(sourceURL *url.URL, httpBody io.Reader) (urls []*url.URL, invalidURLs []string) {
	tokenizer := html.NewTokenizer(httpBody)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return urls, invalidURLs

		case html.StartTagToken, html.SelfClosingTagToken:
			tag := tokenizer.Token()
			if tag.DataAtom.String() != "a" {
				continue
			}
			found, rejected := extractURLs(sourceURL, tag, "href")
			urls = append(urls, found...)
			invalidURLs = append(invalidURLs, rejected...)
		}
	}
}
// urlsToStr converts a []*url.URL into a []string by calling String on each
// element, preserving order. An empty or nil input yields a nil slice.
func urlsToStr(urls []*url.URL) (urlsStr []string) {
	if len(urls) == 0 {
		return nil
	}

	urlsStr = make([]string, len(urls))
	for i := range urls {
		urlsStr[i] = urls[i].String()
	}
	return urlsStr
}
// urlStrToURLs parses each raw URL string with url.Parse, preserving order.
// Parsing stops at the first failure: the URLs parsed so far are returned
// together with that error. An empty or nil input yields (nil, nil).
func urlStrToURLs(urlsStr []string) (urls []*url.URL, err error) {
	for i := range urlsStr {
		parsed, parseErr := url.Parse(urlsStr[i])
		if parseErr != nil {
			return urls, parseErr
		}
		urls = append(urls, parsed)
	}
	return urls, nil
}