Skip to content

Commit 8398679

Browse files
committed
Ask the user for input
1 parent 98725f1 commit 8398679

File tree

1 file changed

+94
-93
lines changed

1 file changed

+94
-93
lines changed

src/crawler.js

+94-93
Original file line number | Diff line number | Diff line change
@@ -3,107 +3,108 @@ const request = require('request');
33
const URL = require('url-parse');
44
const readline = require('readline'); // For user prompt to allow predictions
55

6-
const startUrl = 'http://www.arstechnica.com/';
7-
const maxPagesToVisit = 10;
8-
const websiteKeyword = 'alex';
9-
10-
let numPagesVisited = 0;
11-
let pagesToVisit = [];
12-
let pagesVisited = {};
13-
let url = new URL(startUrl);
14-
let baseUrl = url.protocol + '//' + url.hostname;
15-
166
// Add reading user input:
177
const rl = readline.createInterface({
188
input: process.stdin,
199
output: process.stdout
2010
});
2111

22-
// Search for word
23-
const searchForWord = ($, word) => {
24-
let bodyText = $('html > body').text().toLowerCase();
25-
26-
return(bodyText.indexOf(word.toLowerCase()) !== -1);
27-
};
28-
29-
// Collect links on the website
30-
const collectInternalLinks = ($) => {
31-
let allAbsoluteLinks = [];
32-
let allRelativeLinks = [];
33-
34-
const relativeLinks = $('a[href^=\'/\']');
35-
36-
relativeLinks.each(function() {
37-
allRelativeLinks.push($(this).attr('href'));
38-
39-
pagesToVisit.push(baseUrl + $(this).attr('href'));
40-
});
41-
42-
const absoluteLinks = $('a[href^=\'http\']');
43-
44-
absoluteLinks.each(function() {
45-
allAbsoluteLinks.push($(this).attr('href'));
46-
});
47-
48-
console.log(`Found: ${allAbsoluteLinks.length} absolute links`);
49-
console.log(`Found: ${allRelativeLinks.length} relative links`);
50-
};
51-
52-
53-
// Visit and fetch the website
54-
const visitPage = (url, callback) => {
55-
// Add page to our set
56-
pagesVisited[url] = true;
57-
numPagesVisited++;
58-
59-
// Make the request
60-
console.log(`Visiting page ${url}`);
61-
request(url, (error, response, body) => {
62-
// Check status code (200 is HTTP OK)
63-
console.log(`Status code: ${response.statusCode}`);
64-
if(response.statusCode !== 200) {
65-
callback();
66-
67-
return;
12+
// Ask user for URL and keyword
13+
rl.question('Please enter the URL you want to visit: ', (answerUrl) => {
14+
rl.question('Please enter the keyword you want to search for: ', (answerKeyword) => {
15+
// Both answers have been read and no further input is needed. Close the
// readline interface now: while it stays open it keeps listening on stdin,
// which prevents the process from exiting once the crawl has finished.
rl.close();

// Fall back to the default site when the answer is blank.
const inputUrl = answerUrl.trim() || 'http://www.arstechnica.com/';
// Only a leading "http://" or "https://" counts as a scheme. A substring
// check would also accept hosts that merely contain "http" (for example
// "httpbin.org") and leave them without a protocol.
const hasScheme = /^https?:\/\//i.test(inputUrl);
const checkedUrl = hasScheme ? inputUrl : `http://${inputUrl}`;
const startUrl = checkedUrl;
const maxPagesToVisit = 10;
// Fall back to the default keyword when the answer is blank.
const websiteKeyword = answerKeyword.trim() || 'alex';

// Crawl state shared by the functions defined below.
let numPagesVisited = 0;
const pagesToVisit = [];
const pagesVisited = {};
const url = new URL(startUrl);
// Prefix used to turn site-relative hrefs into full URLs.
const baseUrl = `${url.protocol}//${url.hostname}`;
26+
27+
// Search for word
28+
/**
 * Report whether the page body contains a word, ignoring case.
 *
 * @param {Function} $ - Document query function; called with a selector and
 *   expected to return an object exposing `text()`.
 * @param {string} word - Text to look for.
 * @returns {boolean} True when the lower-cased body text contains the
 *   lower-cased word.
 */
const searchForWord = ($, word) => {
  const bodyText = $('html > body').text().toLowerCase();

  return bodyText.includes(word.toLowerCase());
};
33+
34+
// Collect links on the website
35+
/**
 * Queue the page's site-relative links for crawling and log link counts.
 *
 * Every href that starts with a single "/" is prefixed with `baseUrl` and
 * pushed onto `pagesToVisit`. Absolute links are only counted, never queued.
 *
 * @param {Function} $ - Document query function for the loaded page.
 * @returns {void}
 */
const collectInternalLinks = ($) => {
  const allAbsoluteLinks = [];
  const allRelativeLinks = [];

  $('a[href^=\'/\']').each((_index, element) => {
    const href = $(element).attr('href');

    // "//host/path" is protocol-relative and names its own host; gluing it
    // onto baseUrl would produce a bogus URL, so it is neither counted nor
    // queued as a relative link.
    if (href.startsWith('//')) {
      return;
    }

    allRelativeLinks.push(href);
    pagesToVisit.push(baseUrl + href);
  });

  // The "http" prefix also matches "https" links, so one selector suffices.
  $('a[href^=\'http\']').each((_index, element) => {
    allAbsoluteLinks.push($(element).attr('href'));
  });

  console.log(`Found: ${allAbsoluteLinks.length} absolute links`);
  console.log(`Found: ${allRelativeLinks.length} relative links`);
};
56+
57+
58+
// Visit and fetch the website
59+
/**
 * Fetch one page, search it for the keyword, and continue the crawl.
 *
 * The page is recorded as visited and counted before the request is sent.
 * `callback` is invoked when the request fails, when the status is not 200,
 * or when the keyword is absent (after the page's links have been queued).
 * It is not invoked when the keyword is found, which ends the crawl.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {Function} callback - Called with no arguments to continue crawling.
 * @returns {void}
 */
const visitPage = (url, callback) => {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;

  // Make the request
  console.log(`Visiting page ${url}`);
  request(url, (error, response, body) => {
    // A transport failure (DNS, refused connection, ...) delivers an error
    // and no response object; reading response.statusCode would throw.
    if (error || !response) {
      console.log(`Request failed for ${url}: ${error ? error.message : 'no response'}`);
      callback();

      return;
    }

    // Check status code (200 is HTTP OK)
    console.log(`Status code: ${response.statusCode}`);
    if (response.statusCode !== 200) {
      callback();

      return;
    }

    // Parse the document body
    const $ = cheerio.load(body);

    if (searchForWord($, websiteKeyword)) {
      console.log(`Word ${websiteKeyword} found at page ${url}`);

      return;
    }

    collectInternalLinks($);

    // In this short program, our callback is just calling crawl()
    callback();
  });
};
6989

70-
// Parse the document body
71-
const $ = cheerio.load(body);
72-
let isWordFound = searchForWord($, websiteKeyword);
73-
74-
if(isWordFound) {
75-
console.log(`Word ${websiteKeyword} found at page ${url}`);
76-
} else {
77-
collectInternalLinks($);
78-
79-
// In this short program, our callback is just calling crawl()
80-
callback();
90+
/**
 * Visit the next unvisited page from the queue, if any remain.
 *
 * Stops when the page limit is reached or the queue is empty. Already-visited
 * entries are discarded in a loop rather than by recursion, so a long run of
 * duplicates cannot exhaust the call stack.
 *
 * @returns {void}
 */
const crawl = () => {
  while (numPagesVisited < maxPagesToVisit) {
    // Without this guard pop() yields undefined, which would be handed to
    // visitPage as if it were a URL.
    if (pagesToVisit.length === 0) {
      console.log('No more pages to visit.');

      return;
    }

    const nextPage = pagesToVisit.pop();
    const alreadyVisited = Object.prototype.hasOwnProperty.call(pagesVisited, nextPage);

    if (!alreadyVisited) {
      // New page we haven't visited; visitPage resumes the crawl through
      // its callback.
      visitPage(nextPage, crawl);

      return;
    }
    // We've already visited this page, so try the next queued one.
  }

  console.log('Reached max limit of number of pages to visit.');
};
82-
});
83-
}
84-
85-
const crawl = () => {
86-
if(numPagesVisited >= maxPagesToVisit) {
87-
console.log('Reached max limit of number of pages to visit.');
88-
89-
return;
90-
}
91106

92-
let nextPage = pagesToVisit.pop();
93-
if (nextPage in pagesVisited) {
94-
// We've already visited this page, so repeat the crawl
107+
pagesToVisit.push(startUrl);
95108
crawl();
96-
} else {
97-
// New page we haven't visited
98-
visitPage(nextPage, crawl);
99-
}
100-
}
101-
102-
pagesToVisit.push(startUrl);
103-
crawl();
104-
105-
// Ask user for URL - to be implemented
106-
// rl.question('Please enter the URL you want to visit: ', (answer) => {
107-
// pagesToVisit.push(answer);
108-
// crawl();
109-
// });
109+
});
110+
});

0 commit comments

Comments
 (0)