✨ feat: add css crawler
shurco committed Jul 9, 2023
1 parent dbcd973 commit 5eef1df
Showing 4 changed files with 132 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -7,7 +7,7 @@
"request": "launch",
"mode": "auto",
"program": "${workspaceFolder}/cmd/",
"args": [""]
"args": ["-s","-r", "https://www.hellokuya.co"]
},
]
}
48 changes: 39 additions & 9 deletions pkg/crawler/crawler.go
@@ -35,6 +35,7 @@ type filesBase struct {
css arrutil.Strings
js arrutil.Strings
img arrutil.Strings
font arrutil.Strings
}

var (
@@ -78,6 +79,7 @@ func CloneSite(ctx context.Context, args []string, flag Flags) error {
fmt.Printf("CSS files: %v\n", files.css.Length())
fmt.Printf("JS files: %v\n", files.js.Length())
fmt.Printf("Img files: %v\n", files.img.Length())
fmt.Printf("Font files: %v\n", files.font.Length())

if flag.Open {
cmd := open(projectPath + "/index.html")
@@ -98,7 +100,7 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
files = filesBase{}

body := string(r.Body)
fmt.Print("path: " + r.Response.Request.URL.Path + "\n")
fmt.Printf("page: %s://%s%s\n", projectURL.Scheme, projectURL.Host, r.Response.Request.URL.Path)

// search for all link tags that have a rel attribute that is equal to stylesheet - CSS
r.HTMLDoc.Find("link[rel='stylesheet']").Each(func(i int, s *goquery.Selection) {
@@ -109,11 +111,13 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
fmt.Println("Error parsing URL:", err)
}

if projectURL.Host == parsedURL.Host {
if parsedURL.Host == projectURL.Host || parsedURL.Host == "" {
fmt.Println("Css found", "-->", parsedURL)
if !files.css.Contains(parsedURL.Path) {
files.css = append(files.css, parsedURL.Path)
netutil.Extractor(parsedURL.String(), projectPath)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)

g.Get(r.JoinURL(projectURL.String()+parsedURL.Path), parseCSS)
}

body = strings.Replace(body, data, "/assets/css/"+filepath.Base(data), -1)
@@ -130,11 +134,31 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
fmt.Println("Error parsing URL:", err)
}

if projectURL.Host == parsedURL.Host {
if parsedURL.Host == projectURL.Host || parsedURL.Host == "" {
fmt.Println("Js found", "-->", parsedURL)
if !files.js.Contains(parsedURL.Path) {
files.js = append(files.js, parsedURL.Path)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)
}

body = strings.Replace(body, data, "/assets/js/"+filepath.Base(data), -1)
}
}
})

r.HTMLDoc.Find("link[rel='preload']").Each(func(i int, s *goquery.Selection) {
data, exists := s.Attr("href")
if exists {
parsedURL, err := url.Parse(data)
if err != nil {
fmt.Println("Error parsing URL:", err)
}

if parsedURL.Host == projectURL.Host || parsedURL.Host == "" {
fmt.Println("Js found", "-->", parsedURL)
if !files.js.Contains(parsedURL.Path) {
files.js = append(files.js, parsedURL.Path)
netutil.Extractor(parsedURL.String(), projectPath)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)
}

body = strings.Replace(body, data, "/assets/js/"+filepath.Base(data), -1)
@@ -150,22 +174,28 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
if err != nil {
fmt.Println("Error parsing URL:", err)
}
if strings.HasPrefix(parsedURL.String(), "data:image") || strings.HasPrefix(parsedURL.String(), "blob:") {
if strings.HasPrefix(projectURL.String()+parsedURL.Path, "data:image") || strings.HasPrefix(projectURL.String()+parsedURL.Path, "blob:") {
return
}

if projectURL.Host == parsedURL.Host {
if parsedURL.Host == projectURL.Host || parsedURL.Host == "" {
fmt.Println("Img found", "-->", parsedURL)
if !files.img.Contains(parsedURL.Path) {
files.img = append(files.img, parsedURL.Path)
netutil.Extractor(parsedURL.String(), projectPath)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)
}

body = strings.Replace(body, data, "/assets/img/"+filepath.Base(data), -1)
}
}
})

// search for all src in css in code
r.HTMLDoc.Find("style").Each(func(i int, s *goquery.Selection) {
data := s.Text()
body = readCSS(data, body)
})

r.HTMLDoc.Find("a").Each(func(i int, s *goquery.Selection) {
data, exists := s.Attr("href")
if exists {
@@ -174,7 +204,7 @@ func quotesParse(g *geziyor.Geziyor, r *client.Response) {
fmt.Println("Error parsing URL:", err)
}

if projectURL.Host == parsedURL.Host && parsedURL.Path != "" && parsedURL.Path != "/" {
if (parsedURL.Host == projectURL.Host || parsedURL.Host == "") && parsedURL.Path != "/" {
if !files.pages.Contains(parsedURL.Path) {
files.pages = append(files.pages, parsedURL.Path)
}
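
As context for the recurring change in this file: the old checks only accepted absolute URLs whose host matched the project host, while the new condition also accepts hrefs with an empty host, which is how net/url parses relative links. A minimal standalone sketch of that check, with https://example.com and the asset paths invented for the example:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// Placeholder project URL; in the crawler it comes from the CLI arguments.
	projectURL, _ := url.Parse("https://example.com")

	refs := []string{
		"https://example.com/assets/site.css", // absolute, same host
		"/assets/site.css",                    // relative: url.Parse leaves Host empty
		"https://cdn.example.net/lib.css",     // absolute, foreign host
	}
	for _, ref := range refs {
		parsedURL, err := url.Parse(ref)
		if err != nil {
			fmt.Println("Error parsing URL:", err)
			continue
		}
		// Same shape as the condition introduced above: an empty host means a same-site relative link.
		sameSite := parsedURL.Host == projectURL.Host || parsedURL.Host == ""
		fmt.Printf("%-40s same-site: %v\n", ref, sameSite)
	}
}
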
69 changes: 69 additions & 0 deletions pkg/crawler/css.go
@@ -0,0 +1,69 @@
package crawler

import (
"fmt"
"log"
"net/url"
"path"
"path/filepath"
"regexp"
"strings"

"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/client"
"github.com/shurco/goclone/pkg/fsutil"
"github.com/shurco/goclone/pkg/netutil"
)

func parseCSS(g *geziyor.Geziyor, r *client.Response) {
body := string(r.Body)
base := path.Base(r.Request.URL.Path)

index, err := fsutil.OpenFile(projectPath+"/assets/css/"+base, fsutil.FsCWFlags, 0666)
if err != nil {
log.Fatal(err)
}

if _, err := fsutil.WriteOSFile(index, readCSS(body, body)); err != nil {
log.Fatal(err)
}
}

func readCSS(data, body string) string {
lines := strings.Split(data, "\n")

for _, line := range lines {
line = strings.TrimSpace(line)
regExp, err := regexp.Compile(`url\((.*?)\)`)
if err != nil {
fmt.Println("Error compiling regex pattern:", err)
}

matches := regExp.FindAllStringSubmatch(line, -1)
for _, match := range matches {
parsedURL, err := url.Parse(match[1])
if err != nil {
fmt.Println("Error parsing URL:", err)
}

if parsedURL.Host == projectURL.Host || parsedURL.Host == "" {
folder := netutil.GetAssetDir(parsedURL.Path)
switch folder {
case "assets/font":
if !files.font.Contains(parsedURL.Path) {
files.font = append(files.font, parsedURL.Path)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)
}

case "assets/img":
if !files.img.Contains(parsedURL.Path) {
files.img = append(files.img, parsedURL.Path)
netutil.Extractor(projectURL.String()+parsedURL.Path, projectPath)
}
}
body = strings.Replace(body, match[1], "/"+folder+"/"+filepath.Base(match[1]), -1)
}
}
}
return body
}
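
To show what readCSS picks up, here is a small standalone sketch of the url(...) pattern it compiles; the CSS line and the asset paths are invented for the example:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Invented CSS declaration with two url() references, as readCSS would see it line by line.
	line := "src: url(/fonts/icons.woff2) format('woff2'); background: url(/img/bg.png);"

	// Same non-greedy pattern as readCSS: each capture stops at the first closing parenthesis.
	re := regexp.MustCompile(`url\((.*?)\)`)
	for _, match := range re.FindAllStringSubmatch(line, -1) {
		fmt.Println("asset reference:", match[1])
	}
	// Prints:
	//   asset reference: /fonts/icons.woff2
	//   asset reference: /img/bg.png
}
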
32 changes: 23 additions & 9 deletions pkg/netutil/netutil.go
@@ -13,13 +13,18 @@ import (

var (
extensionDir = map[string]string{
".css": "assets/css",
".js": "assets/js",
".jpg": "assets/img",
".jpeg": "assets/img",
".gif": "assets/img",
".png": "assets/img",
".svg": "assets/img",
".css": "assets/css",
".js": "assets/js",
".jpg": "assets/img",
".jpeg": "assets/img",
".gif": "assets/img",
".png": "assets/img",
".svg": "assets/img",
".eot": "assets/font",
".otf": "assets/font",
".ttf": "assets/font",
".woff": "assets/font",
".woff2": "assets/font",
}
)

@@ -52,12 +57,21 @@ func Extractor(link string, projectPath string) {
}
}

// GetAssetDir is ...
func GetAssetDir(filename string) string {
dirPath := extensionDir[urlExtension(filename)]
if dirPath != "" {
return dirPath
}
return ""
}

func urlExtension(URL string) string {
ext := path.Ext(URL)
if len(ext) > 5 {
match, _ := regexp.MatchString(`^[a-zA-Z]+$`, ext[1:])
match, _ := regexp.MatchString(`^[a-zA-Z0-9]+$`, ext[1:])
if !match {
ext = ext[:2] // Assuming you want to keep the first character of the extension
ext = ext[:2]
}
}
return ext
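
For illustration, a trimmed standalone mirror of the extensionDir lookup above, showing how the new font entries route files into assets/font; the map copies only a few of the real entries and the file paths are invented:

package main

import (
	"fmt"
	"path"
)

// A small subset of the extension-to-directory table, including one of the new font entries.
var assetDir = map[string]string{
	".css":   "assets/css",
	".js":    "assets/js",
	".jpg":   "assets/img",
	".woff2": "assets/font",
}

func main() {
	for _, f := range []string{"/fonts/icons.woff2", "/img/logo.jpg", "/js/app.js", "/css/site.css"} {
		// path.Ext returns the extension with the leading dot, matching the map keys.
		fmt.Printf("%-22s -> %s\n", f, assetDir[path.Ext(f)])
	}
}
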
