-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScraper.js
135 lines (109 loc) · 3.92 KB
/
Scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
var Q = require('q');
var Repositories = require('./models/Repositories.js');
var Users = require('./models/Users.js');
var color = require('colors');
// Number of days in a non-leap year.
var DAYS_PER_YEAR = 365;
// NOTE(review): not referenced by the code visible in this file; presumably
// the size, in days, of a scraping window -- confirm before relying on it.
var DAYS_TO_SCRAPE = 2;

// Namespace object that carries the scraper's public functions.
// Declared with `var` so it is a module-scoped binding rather than an implicit
// global: assigning to an undeclared name throws in strict mode / ES modules
// and otherwise leaks onto the global object. Consumers receive the object
// through `module.exports`.
var Scraper = {};
/**
 * Placeholder entry point: takes no arguments, does nothing and returns
 * undefined.
 */
Scraper.start = function () {
  // no-op for now
};
/**
 * Starts the scraping routines and hands back a Q promise.
 *
 * Only the code-snippet scan (`_getCodeData`) is launched. The other routines
 * are currently switched off:
 *   - `_getRepoAndUserData`  (walk back through created repos and store each
 *                             repo together with its creator)
 *   - `_getUserNetworkData`  (query users and how they follow each other)
 *   - `_famousRepoRoutine`
 *
 * @returns {Promise} the promise created by `Q.Promise`. The resolver itself
 *   never calls `resolve`, `reject` or `notify`.
 */
Scraper.run = function() {
  function launchRoutines(resolve, reject, notify) {
    // Look at the repos already found and check whether they contain
    // interesting code snippets.
    _getCodeData();
  }

  return Q.Promise(launchRoutines);
};
/**
 * Runs one pass of the code-snippet scan and, when the pass succeeds, starts
 * the next pass by calling itself again.
 *
 * A failure in any stage is logged with console.log and ends the loop, so it
 * no longer stops silently. The next pass is deliberately not returned from
 * the success callback, so the promise chain does not grow with every
 * iteration.
 *
 * NOTE(review): `Code` is not required in this file; presumably it is provided
 * as a global by another module -- confirm.
 *
 * @returns {Promise} promise for this single pass; it is fulfilled with
 *   undefined both after a successful pass and after a logged failure.
 */
function _getCodeData(){
  return Repositories.needScan() // gets the top repos that should be scanned
    .then(Repositories.scanAll) // scans them for code snippets that match signatures
    .then(Code.saveAll) // saves those snippets, flagging the repos
    .then(function(saved){
      _getCodeData();
    })
    .then(null, function(err){
      console.log('ERROR SCANNING REPOS FOR CODE', err);
    });
}
/**
 * Fetches repositories (and their users) from the last recorded repo-creation
 * scan date up to the moment this function is called.
 *
 * The steps form a single promise chain with one terminal rejection handler,
 * so a failure while reading the last scan date, or an exception in a
 * callback, is printed with console.log.
 *
 * NOTE(review): `Data` and `debug` are not required in this file; presumably
 * they are provided as globals -- confirm.
 *
 * @returns {Promise} promise that settles once the chain has finished; it is
 *   fulfilled with undefined, or with console.log's return value after a
 *   logged failure.
 */
function _getRepoAndUserData(){
  // TODO: maybe find the most recent date in repositories and start from
  // there, going forwards?
  var today = new Date();
  return Data.getLastRepoCreationScanDate()
    .then(function(lastScanDate){
      // NOTE(review): this call leaves the date unchanged. The original
      // comment said "bump to after the last scanned date", which would need
      // getDate() + 1 -- confirm whether re-scanning the last day is intended.
      lastScanDate.setDate(lastScanDate.getDate());
      return promiseForGettingRepos(lastScanDate, today);
    })
    .then(function(done){
      debug('DONE = ', done);
    })
    .then(null, console.log);
}
/**
 * Scratch routine for the "famous repository" analysis.
 *
 * For now it only asks `Data.getCodeForRepository` for the value 39080
 * (presumably a repository id) and prints the fulfilled value with
 * console.log. Nothing is returned and rejections are not handled.
 */
function _famousRepoRoutine(){
  /*
    Planning notes (questions this routine is meant to help answer):

    Beta users are your customers; we need to answer these questions
    (aka not doing customer service).

    - Repositories are not active: look at their push frequencies across the
      year and find the most active repos.
    - Look at creation frequencies over the past year: when were more repos
      created?
    - Is the conversion rate bad?

    For each repo flagged famous, find its user:
    - look at all of the user's repos
    - get the languages of each repo
    - for each repo, grab the frequency of pushes
    - see how the famous repo ranks in activity compared to the user's
      other repos

    What do famous users spend time on?
    People who have famous repositories have a k factor of ?
    How many followers? Are they influential?
  */

  // Intended steps:
  //   1. get all repos that have famous_confirmed flagged
  //   2. get each repo's user
  //   3. get all of that user's repositories
  var repositoryId = 39080;
  var codeLookup = Data.getCodeForRepository(repositoryId);
  codeLookup.then(console.log);
}
/**
 * Re-scans repositories returned by `Data.getFamousRepos` and saves the
 * matching snippets, then calls itself again with `n + 1`.
 *
 * A failure in any stage is logged with console.log and ends the loop, so it
 * no longer stops silently.
 *
 * Author's outline of the intended check: get all code snippets, scan to see
 * which ones actually have famous/core/engines, and flag those as
 * 'FAMOUS_CONFIRM'.
 *
 * NOTE(review): `n` is only forwarded to the recursive call; the query always
 * passes the literal 1. If `n` was meant to be a page/batch index this should
 * be `Data.getFamousRepos(n)` -- confirm against `getFamousRepos`. There is
 * also no stop condition other than a failure.
 *
 * @param {number} n - iteration counter, incremented on every pass.
 * @returns {Promise} promise for this single pass; fulfilled with undefined
 *   both after a successful pass and after a logged failure.
 */
function _famousRepoValidityCheck(n) {
  return Data.getFamousRepos(1)
    .then(Repositories.scanAll) // scans them for code snippets that match signatures
    .then(Code.saveAll) // saves those snippets, flagging the repos
    .then(function(saved){
      _famousRepoValidityCheck(n + 1);
    })
    .then(null, function(err){
      console.log('ERROR CHECKING FAMOUS REPOS', err);
    });
}
// Returns the timestamp of local midnight for the calendar day of `date`, so
// two dates can be compared by day regardless of their time of day.
function _calendarDay(date) {
  return new Date(date.getFullYear(), date.getMonth(), date.getDate()).getTime();
}

/**
 * Fetches JavaScript repositories from GitHub one day at a time, from
 * `startDate` through `endDate` (inclusive), saving the users and then the
 * repositories of each day before moving on to the next day.
 *
 * - Every nested promise is returned, so the result settles only after the
 *   whole range has been processed.
 * - The stop test compares full calendar days, not just the day of the month,
 *   so ranges spanning more than a month work and a start date later than the
 *   end date is processed once and then stops.
 * - `startDate` is not mutated; each following day is a fresh Date.
 *
 * @param {Date} startDate - first day to fetch.
 * @param {Date} endDate - last day to fetch.
 * @returns {Promise} fulfilled with 'DONE' after the last day was saved, or
 *   with undefined when a stage failed and was reported by one of the
 *   `_logError*` handlers.
 */
function promiseForGettingRepos(startDate, endDate){
  return Repositories.getFromGithubForDateAndLanguage(startDate, 'javascript')
    .then(function(repos){
      return Users.saveAllFromRepos(repos).then(function(userIds){
        return Repositories.saveAll(userIds, repos).then(function(repoIds){
          if (_calendarDay(startDate) >= _calendarDay(endDate)) {
            return 'DONE';
          }
          // Work on a copy so the caller's Date object is left untouched.
          var nextDate = new Date(startDate.getTime());
          nextDate.setDate(nextDate.getDate() + 1);
          return promiseForGettingRepos(nextDate, endDate);
        }, _logErrorSaveRepo);
      }, _logErrorSaveUser);
    }, _logErrorGetRepo);
}
/**
 * Prints a "saving code" failure to the console.
 *
 * @param {*} data - the failure detail, passed to console.log as-is.
 */
function _logErrorSaveCode(data){
  var label = 'ERROR SAVING CODE';
  console.log(label, data);
}
/**
 * Prints a "searching user for code" failure to the console.
 *
 * @param {*} data - the failure detail, passed to console.log as-is.
 */
function _logErrorSearchUserForCode(data){
  var label = 'ERROR SEARCHING USER FOR CODE';
  console.log(label, data);
}
/**
 * Prints a "saving repo" failure to the console.
 *
 * The detail is concatenated onto the label and the `.red` property of the
 * resulting string is what gets logged (presumably supplied by the `colors`
 * package required at the top of the file).
 *
 * @param {*} data - the failure detail; coerced to a string by concatenation.
 */
function _logErrorSaveRepo(data) {
  var message = 'ERROR SAVING REPO ' + data;
  console.log(message.red);
}
/**
 * Prints a "saving user" failure to the console.
 *
 * The detail is concatenated onto the label and the `.red` property of the
 * resulting string is what gets logged (presumably supplied by the `colors`
 * package required at the top of the file).
 *
 * @param {*} data - the failure detail; coerced to a string by concatenation.
 */
function _logErrorSaveUser(data) {
  var message = 'ERROR SAVING USER ' + data;
  console.log(message.red);
}
/**
 * Prints a "get repo" failure to the console.
 *
 * The detail is concatenated onto the label and the `.red` property of the
 * resulting string is what gets logged (presumably supplied by the `colors`
 * package required at the top of the file).
 *
 * @param {*} data - the failure detail; coerced to a string by concatenation.
 */
function _logErrorGetRepo(data) {
  var message = 'ERROR GET REPO ' + data;
  console.log(message.red);
}
// Expose the Scraper namespace object as this module's export.
module.exports = Scraper;