diff --git a/crawler.js b/crawler.js
index 018d26f..8c221fb 100644
--- a/crawler.js
+++ b/crawler.js
@@ -76,6 +76,7 @@ class BlockCrawler extends EventEmitter {
         }
         var shouldCrawl = this.shouldCrawl;
         var allowedDomains = this.allowedDomains;
+        var forceHttps = this.forceHttps;
 
         return function(context) {
             var $;
@@ -93,7 +94,13 @@ class BlockCrawler extends EventEmitter {
                 targetHref = $this.attr("href");
                 absoluteTargetUrl = urlMod.resolve(context.url, targetHref);
                 urlObj = urlMod.parse(absoluteTargetUrl);
-                protocol = urlObj.protocol;
+                // url.parse() yields the scheme with its trailing colon ("http:"), so
+                // the forced value is "https:"; only plain-http links are upgraded.
+                if (forceHttps && urlObj.protocol === 'http:') {
+                    protocol = 'https:';
+                } else {
+                    protocol = urlObj.protocol;
+                }
                 hostname = urlObj.hostname;
 
 
@@ -121,6 +128,7 @@ class BlockCrawler extends EventEmitter {
         this.proxyUri = argv.proxy;
         this.redisserver = argv.redisserver;
         this.debug = argv.debug;
+        this.forceHttps = argv.force_https;
         var _allowed_domains = argv.allowed_domains;
         if (undefined != _allowed_domains) {
             try{
diff --git a/index.js b/index.js
index 22222f6..130be2d 100644
--- a/index.js
+++ b/index.js
@@ -47,6 +47,11 @@ const argv = require('yargs')
         type: 'string',
         description: 'hostname:UrlPattern JSON object for allowed domains'
     })
+    .option('force_https',{
+        type: 'boolean',
+        description: "Force https instead of http",
+        default: true
+    })
     //.demandCommand(1)
     .argv;
 