-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscan_recipe_site_info.rb
129 lines (115 loc) · 3.86 KB
/
scan_recipe_site_info.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env ruby
require 'nokogiri'
# Builds the XPath that selects the list items belonging to a named panel.
#
# The expression starts at the panel's title element (see panel_named),
# steps up to its parent, and descends into the second <div>'s <ul>/<li>
# children.
#
# panel - text expected inside the panel title.
#
# Returns the XPath expression as a String.
def panel_data(panel)
  panel_named(panel) + '/../div[2]/ul/li'
end
# Builds a relative XPath matching any element whose class attribute is
# exactly 'panel-title' and whose text() contains the given string.
#
# a - text to look for inside the panel title (interpolated verbatim).
#
# Returns the XPath expression as a String.
def panel_named(a)
  predicates = ["@class='panel-title'", "contains(text(),'#{a}')"]
  ".//*[#{predicates.join(' and ')}]"
end
# Parses each file with Nokogiri::HTML and yields the parsed result.
#
# Each file is opened with the block form of File.open so its handle is
# closed as soon as parsing returns, instead of staying open until garbage
# collection (which matters when scanning a large directory).
#
# files - Enumerable of file paths to parse.
# blk   - block invoked once per file with the parsed result.
#
# Returns whatever files.each returns.
def process_html_files(files, &blk)
  files.each do |path|
    document = File.open(path, 'r') { |io| Nokogiri::HTML(io) }
    yield(document)
  end
end
# Runs a selector against a node and yields every match in turn.
#
# artifact - object responding to #search (a parsed document or node).
# context  - selector string handed straight to artifact.search.
# blk      - block invoked once per matching node.
#
# Returns whatever #each on the search result returns.
def search_with_scope(artifact, context, &blk)
  matches = artifact.search(context)
  matches.each { |match| yield(match) }
end
# Finds every element whose text() contains the given string, ignoring
# ASCII letter case.
#
# The XPath lower-cases the element text with translate(); the needle is
# lower-cased here with the same A-Z -> a-z mapping. Without that, a needle
# containing an upper-case letter could never match the lower-cased text.
#
# artifact - object responding to #search (a parsed document or node).
# text     - substring to look for; converted with #to_s.
#
# NOTE(review): the needle is embedded in a single-quoted XPath literal, so
# a needle containing a single quote yields an invalid expression. The
# callers visible in this file only pass fixed lower-case words.
#
# Returns the result of artifact.search.
def search_for_text(artifact, text)
  needle = text.to_s.tr('A-Z', 'a-z')
  lowered_text = "translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')"
  artifact.search("//*[contains(#{lowered_text}, '#{needle}')]")
end
# Extracts usage and rating figures from every '#rating' element.
#
# Keys set per '#rating' element (a later element overwrites an earlier one):
#   'usage'  - digits from "Used N times", or "-1" when the phrase is absent.
#   'points' - signed digits from "average impact of N points", or "NODATA".
#   'avg'    - last whitespace-separated token of each '.average-rating' text.
#   'votes'  - first token of each '.total-votes' text with '(' removed.
#
# The impact value is read from a MatchData object rather than $1/$2: the
# previous sign test (`$1 =~ /-/`) was itself a regexp match and reset those
# globals, so the digits were lost and 'points' never held the real number.
#
# html - object responding to #search (a parsed document).
#
# Returns a Hash with String keys; empty when no '#rating' element exists.
def glob_rating_data(html)
  data = {}
  search_with_scope(html, '#rating') do |context|
    text = context.inner_text

    usage = text.match(/Used (\d+) times/i)
    data['usage'] = usage ? usage[1] : '-1'

    impact = text.match(/average impact of (-?\d+) points/i)
    data['points'] = impact ? impact[1] : 'NODATA'

    search_with_scope(context, '.average-rating') do |node|
      data['avg'] = node.inner_text.split(' ').last
    end
    search_with_scope(context, '.total-votes') do |node|
      # to_s guards against an empty element, where split yields no tokens.
      data['votes'] = node.inner_text.split(' ').first.to_s.delete('(')
    end
  end
  data
end
# Collects lineage, authorship and "best for" details from a recipe page.
#
# From each '.macro-right' element:
#   'parent'   - "0" when the Parent panel text matches /none/i, else "1".
#   'children' - number of list items in the Children panel (Integer).
#   'authors'  - Hash keyed by the first token of each Authors panel item,
#                holding the '.schl-so' text under 'solo' and the
#                '.schl-ev' text under 'evolve'.
# From each '.macro-left' element:
#   'best-for-list' - texts of the '.best-for-list' matches; the key is only
#                     set when at least one match exists.
#
# html - object responding to #search (a parsed document).
#
# Returns a Hash with String keys.
def glob_auxiliary_data(html)
  data = {}

  search_with_scope(html, '.macro-right') do |right_column|
    parent_text = right_column.search(panel_data('Parent')).inner_text
    data['parent'] = parent_text =~ /none/i ? "0" : "1"
    data['children'] = right_column.search(panel_data('Children')).size

    authors = {}
    data['authors'] = authors
    search_with_scope(right_column, panel_data('Authors')) do |entry|
      author = entry.inner_text.split(' ').first
      authors[author] = {
        'solo' => entry.search('.schl-so').inner_text,
        'evolve' => entry.search('.schl-ev').inner_text
      }
    end
  end

  search_with_scope(html, '.macro-left') do |left_column|
    best_for = left_column.search('.best-for-list')
    next if best_for.empty?

    data['best-for-list'] = best_for.map(&:inner_text)
  end

  data
end
# Counts forum comments and records where a fixed list of keywords occurs.
#
# Result layout:
#   'comments'           - number of '#forum-comments .forum-post' matches,
#                          as a String.
#   'sentiment_analysis' - Hash keyed by keyword; each value holds :count
#                          (number of elements returned by search_for_text)
#                          and :context (the inner text of each of them).
#
# Several keywords are deliberate word stems ('stabili', 'modifi', ...).
#
# html - object responding to #search (a parsed document).
#
# Returns a Hash.
def glob_human_data(html)
  keywords = %w[
    beginning start middle end long fast quick score great
    smart bug thank slow need prerequisite huge initial hours forever
    old new intermedia early late improve restore fix probe test stabili longer
    modifi best repear recurs team compete unfair defect
  ]

  comment_count = html.search('#forum-comments .forum-post').size.to_s

  mentions = keywords.each_with_object({}) do |keyword, found|
    hits = search_for_text(html, keyword)
    found[keyword] = {
      :count => hits.size,
      :context => hits.map(&:inner_text)
    }
  end

  { 'comments' => comment_count, 'sentiment_analysis' => mentions }
end
# Reads the recipe name from the first '.node-name' element inside each
# '.macro-left' element; when several '.macro-left' elements carry a name,
# the last one wins.
#
# A '.macro-left' without any '.node-name' is skipped rather than raising
# NoMethodError on nil, so one malformed page no longer aborts the scan.
#
# html - object responding to #search (a parsed document).
#
# Returns the stripped name, or '' when no name element was found.
def scrape_recipe_name(html)
  name = ''
  search_with_scope(html, '.macro-left') do |context|
    node = context.search('.node-name').first
    name = node.inner_text.strip if node
  end
  name
end
# Appends one record describing a scanned recipe to the log file.
#
# A record is a dashed separator line, four labelled lines holding the
# #inspect form of each value, and a closing "#" line.
#
# data - Hash with Symbol keys:
#        :out    - path of the log file (opened in append mode)
#        :name   - written after "Recipe: "
#        :rating - written after "Rating: "
#        :aux    - written after "Context: "
#        :human  - written after "Expressed: "
#
# Returns the value of the final write call.
def log_processing_results(data)
  labelled_fields = {
    'Recipe' => :name,
    'Rating' => :rating,
    'Context' => :aux,
    'Expressed' => :human
  }

  File.open(data[:out], 'a') do |log|
    log.write("------------------------------------------------\n")
    labelled_fields.each do |label, key|
      log.write("#{label}: #{data[key].inspect}\n")
    end
    log.write("#\n")
  end
end
# Script entry point.
#
# Usage: scan_recipe_site_info.rb HTML_DIR [OUTPUT_LOG]
#
# Parses every regular file directly inside HTML_DIR and appends one record
# per file to OUTPUT_LOG, which is recreated on every run.
FILES_DIR = ARGV[0]
OUTPUT_LOG = ARGV[1] || './recipe-scan-log.log'

# A missing argument used to reach File.exist?(nil) and raise TypeError;
# report the problem instead, keeping the original -1 exit status.
unless FILES_DIR && File.directory?(FILES_DIR)
  warn "usage: #{File.basename($PROGRAM_NAME)} HTML_DIR [OUTPUT_LOG]"
  exit(-1)
end

# Sorted so the log order does not depend on filesystem enumeration order.
# '.' and '..' are dropped by the directory filter.
files = Dir.entries(FILES_DIR).sort.map { |n| File.join(FILES_DIR, n) }.reject { |f| File.directory?(f) }

# Start from a fresh log. Deleting in-process avoids passing an unquoted
# path through the shell.
File.delete(OUTPUT_LOG) if File.exist?(OUTPUT_LOG)

process_html_files(files) do |html|
  log_processing_results({
    :name => scrape_recipe_name(html),
    :rating => glob_rating_data(html),
    :human => glob_human_data(html),
    :aux => glob_auxiliary_data(html),
    :out => OUTPUT_LOG
  })
end
exit(0)