@@ -3,107 +3,108 @@ const request = require('request');
3
3
const URL = require ( 'url-parse' ) ;
4
4
const readline = require ( 'readline' ) ; // For user prompt to allow predictions
5
5
6
- const startUrl = 'http://www.arstechnica.com/' ;
7
- const maxPagesToVisit = 10 ;
8
- const websiteKeyword = 'alex' ;
9
-
10
- let numPagesVisited = 0 ;
11
- let pagesToVisit = [ ] ;
12
- let pagesVisited = { } ;
13
- let url = new URL ( startUrl ) ;
14
- let baseUrl = url . protocol + '//' + url . hostname ;
15
-
16
6
// Add reading user input:
17
7
const rl = readline . createInterface ( {
18
8
input : process . stdin ,
19
9
output : process . stdout
20
10
} ) ;
21
11
22
- // Search for word
23
- const searchForWord = ( $ , word ) => {
24
- let bodyText = $ ( 'html > body' ) . text ( ) . toLowerCase ( ) ;
25
-
26
- return ( bodyText . indexOf ( word . toLowerCase ( ) ) !== - 1 ) ;
27
- } ;
28
-
29
- // Collect links on the website
30
- const collectInternalLinks = ( $ ) => {
31
- let allAbsoluteLinks = [ ] ;
32
- let allRelativeLinks = [ ] ;
33
-
34
- const relativeLinks = $ ( 'a[href^=\'/\']' ) ;
35
-
36
- relativeLinks . each ( function ( ) {
37
- allRelativeLinks . push ( $ ( this ) . attr ( 'href' ) ) ;
38
-
39
- pagesToVisit . push ( baseUrl + $ ( this ) . attr ( 'href' ) ) ;
40
- } ) ;
41
-
42
- const absoluteLinks = $ ( 'a[href^=\'http\']' ) ;
43
-
44
- absoluteLinks . each ( function ( ) {
45
- allAbsoluteLinks . push ( $ ( this ) . attr ( 'href' ) ) ;
46
- } ) ;
47
-
48
- console . log ( `Found: ${ allAbsoluteLinks . length } absolute links` ) ;
49
- console . log ( `Found: ${ allRelativeLinks . length } relative links` ) ;
50
- } ;
51
-
52
-
53
- // Visit and fetch the website
54
- const visitPage = ( url , callback ) => {
55
- // Add page to our set
56
- pagesVisited [ url ] = true ;
57
- numPagesVisited ++ ;
58
-
59
- // Make the request
60
- console . log ( `Visiting page ${ url } ` ) ;
61
- request ( url , ( error , response , body ) => {
62
- // Check status code (200 is HTTP OK)
63
- console . log ( `Status code: ${ response . statusCode } ` ) ;
64
- if ( response . statusCode !== 200 ) {
65
- callback ( ) ;
66
-
67
- return ;
12
+ // Ask user for URL and keyword
13
+ rl . question ( 'Please enter the URL you want to visit: ' , ( answerUrl ) => {
14
+ rl . question ( 'Please enter the keyword you want to search for: ' , ( answerKeyword ) => {
15
// --- Crawl configuration and mutable crawl state, derived from the two prompt answers ---

// Fall back to the default site when the user just presses Enter (or types only spaces).
const inputUrl = (answerUrl || '').trim() || 'http://www.arstechnica.com/';

// Only an explicit "http://" or "https://" prefix counts as a scheme. A substring
// search would also accept input that merely contains "http" somewhere
// (e.g. "example.com/http-basics") and leave it without a scheme.
const checkedUrl = /^https?:\/\//i.test(inputUrl) ? inputUrl : `http://${inputUrl}`;
const startUrl = checkedUrl;
const maxPagesToVisit = 10;
const websiteKeyword = answerKeyword || 'alex';

// Number of pages requested so far; compared against maxPagesToVisit by crawl().
let numPagesVisited = 0;
// Stack of URLs still to be fetched.
const pagesToVisit = [];
// URL -> true for every page already requested.
const pagesVisited = {};
const url = new URL(startUrl);
// `host` keeps an explicit port (hostname alone would drop it), so relative links
// found on e.g. http://localhost:8080 are queued against the same port.
const baseUrl = `${url.protocol}//${url.host}`;
26
+
27
/**
 * Report whether `word` occurs in the text content of the page's `html > body`.
 * Both sides are lower-cased first, so the match is case-insensitive.
 *
 * @param {Function} $ - Selector function for the loaded page (produced by `cheerio.load`).
 * @param {string} word - Keyword to look for.
 * @returns {boolean} True when the body text contains the keyword.
 */
const searchForWord = ($, word) => {
  const bodyText = $('html > body').text().toLowerCase();

  return bodyText.includes(word.toLowerCase());
};
33
+
34
/**
 * Count the links on a page and queue the site-internal ones for crawling.
 *
 * Root-relative hrefs ("/path") are appended to `baseUrl` and pushed onto
 * `pagesToVisit`. Absolute links are only counted, never queued. The two
 * totals are logged to the console.
 *
 * @param {Function} $ - Selector function for the loaded page (produced by `cheerio.load`).
 */
const collectInternalLinks = ($) => {
  const allAbsoluteLinks = [];
  const allRelativeLinks = [];

  $('a[href^="/"]').each((index, element) => {
    const href = $(element).attr('href');

    // "//host/path" is protocol-relative: it names another host, so gluing it
    // onto baseUrl would queue a bogus URL. Count it as absolute instead.
    if (href.startsWith('//')) {
      allAbsoluteLinks.push(href);
      return;
    }

    allRelativeLinks.push(href);
    pagesToVisit.push(`${baseUrl}${href}`);
  });

  // The "http" prefix also matches "https", so a single selector covers both.
  $('a[href^="http"]').each((index, element) => {
    allAbsoluteLinks.push($(element).attr('href'));
  });

  console.log(`Found: ${allAbsoluteLinks.length} absolute links`);
  console.log(`Found: ${allRelativeLinks.length} relative links`);
};
56
+
57
+
58
/**
 * Fetch one page, search it for `websiteKeyword`, and either stop or continue the crawl.
 *
 * The page is recorded in `pagesVisited` and counted in `numPagesVisited`
 * before the request is sent. Outcomes:
 *   - request error or non-200 status: the page is skipped and `callback` runs;
 *   - keyword found: the hit is logged, the readline interface is closed and
 *     `callback` is NOT invoked, which ends the crawl;
 *   - keyword not found: the page's internal links are queued and `callback` runs.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {Function} callback - Called with no arguments when crawling should continue.
 */
const visitPage = (url, callback) => {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;

  // Make the request
  console.log(`Visiting page ${url} `);
  request(url, (error, response, body) => {
    // A transport-level failure (DNS lookup, refused connection, ...) reports an
    // error and no response object; skip the page instead of dereferencing it.
    if (error || !response) {
      console.log(`Request for ${url} failed: ${error ? error.message : 'no response'}`);
      callback();

      return;
    }

    // Check status code (200 is HTTP OK)
    console.log(`Status code: ${response.statusCode} `);
    if (response.statusCode !== 200) {
      callback();

      return;
    }

    // Parse the document body
    const $ = cheerio.load(body);

    if (searchForWord($, websiteKeyword)) {
      console.log(`Word ${websiteKeyword} found at page ${url}`);
      // The search is over. An open readline interface keeps the process alive,
      // so release it to let the program exit.
      rl.close();

      return;
    }

    collectInternalLinks($);

    // In this short program, our callback is just calling crawl()
    callback();
  });
};
69
89
70
- // Parse the document body
71
- const $ = cheerio . load ( body ) ;
72
- let isWordFound = searchForWord ( $ , websiteKeyword ) ;
73
-
74
- if ( isWordFound ) {
75
- console . log ( `Word ${ websiteKeyword } found at page ${ url } ` ) ;
76
- } else {
77
- collectInternalLinks ( $ ) ;
78
-
79
- // In this short program, our callback is just calling crawl()
80
- callback ( ) ;
90
/**
 * Take the next unvisited URL off `pagesToVisit` and hand it to `visitPage`,
 * passing `crawl` itself as the continuation.
 *
 * The crawl ends, and the readline interface is closed so the process can
 * exit, when `numPagesVisited` reaches `maxPagesToVisit` or when the queue
 * runs dry.
 */
const crawl = () => {
  // Loop (rather than recurse) past already-visited entries so a long run of
  // duplicates in the queue cannot grow the call stack.
  while (true) {
    if (numPagesVisited >= maxPagesToVisit) {
      console.log('Reached max limit of number of pages to visit.');
      rl.close();

      return;
    }

    // Popping an empty queue yields undefined, which must never reach visitPage.
    if (pagesToVisit.length === 0) {
      console.log('No more pages left to visit.');
      rl.close();

      return;
    }

    const nextPage = pagesToVisit.pop();

    // Own-property check: `in` would also report keys inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(pagesVisited, nextPage)) {
      // New page we haven't visited
      visitPage(nextPage, crawl);

      return;
    }
    // We've already visited this page, so try the next queued one.
  }
};
82
- } ) ;
83
- }
84
-
85
- const crawl = ( ) => {
86
- if ( numPagesVisited >= maxPagesToVisit ) {
87
- console . log ( 'Reached max limit of number of pages to visit.' ) ;
88
-
89
- return ;
90
- }
91
106
92
- let nextPage = pagesToVisit . pop ( ) ;
93
- if ( nextPage in pagesVisited ) {
94
- // We've already visited this page, so repeat the crawl
107
+ pagesToVisit . push ( startUrl ) ;
95
108
crawl ( ) ;
96
- } else {
97
- // New page we haven't visited
98
- visitPage ( nextPage , crawl ) ;
99
- }
100
- }
101
-
102
- pagesToVisit . push ( startUrl ) ;
103
- crawl ( ) ;
104
-
105
- // Ask user for URL - to be implemented
106
- // rl.question('Please enter the URL you want to visit: ', (answer) => {
107
- // pagesToVisit.push(answer);
108
- // crawl();
109
- // });
109
+ } ) ;
110
+ } ) ;
0 commit comments