-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
67 lines (59 loc) · 2.3 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import scraperwiki
import lxml.html
import uuid
import datetime
# Variables
# Amazon Wish List identifier; it is inserted into the wish list URL that is
# scraped for ASINs.
wishlist = "255H4B66T7IBO"
# Price threshold: is_cheap() reports "Is going cheap!" for prices strictly
# below this value.
# NOTE(review): presumably pounds sterling, since the pages come from
# amazon.co.uk - confirm.
limit = 1.50
# Filled with the ASINs found on the wish list, then iterated to look up each
# item's product page.
asins = []
# Functions
def is_cheap(price):
    """Return the status text for *price* relative to the module-level ``limit``.

    The checks run in this order:

    * price strictly below ``limit``  -> "Is going cheap!"
    * price equal to 9999999          -> "We can't get a price."
      (9999999 is the placeholder the item loop assigns before it tries
      to read a real price.)
    * anything else                   -> "Is not going cheap."
    """
    if price < float(limit):
        return "Is going cheap!"  # Text to search for within IFTTT
    if price == 9999999:
        return "We can't get a price."
    return "Is not going cheap."
def save(title, link, description):
    """Write one item to the ScraperWiki SQLite store and return "Saved!".

    The stored record has five fields:

    * ``title``, ``link``, ``description`` - the arguments, unchanged.
    * ``guid`` - ``link`` with ``"&uuid=<uuid1>"`` appended, so it differs
      on every call even when the link is the same.
    * ``pubDate`` - ``str()`` of the current local (naive) datetime.

    Rows are saved with ``unique_keys=['link']``.
    NOTE(review): presumably this makes a later save for the same link
    replace the earlier row - confirm against the scraperwiki docs.
    """
    timestamp = str(datetime.datetime.now())
    unique_guid = link + "&uuid=" + str(uuid.uuid1())
    record = {
        "title": title,
        "link": link,
        "description": description,
        "guid": unique_guid,
        "pubDate": timestamp,
    }
    scraperwiki.sqlite.save(unique_keys=['link'], data=record)
    return "Saved!"
# Collect the ASIN of every item on the configured Amazon Wish List.
# The print-layout page is requested with items-per-page=2500, so only the
# first page is fetched.
try:  # Try and get ASINs from your Amazon Wish List
    url = "http://www.amazon.co.uk/wishlist/" + wishlist + "/ref=cm_wl_act_print_o?_encoding=UTF8&disableNav=1&items-per-page=2500&layout=standard-print&page=1&visitor-view=1"
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    for element in root.cssselect("tbody[class='itemWrapper']"):
        # The name attribute is dot-separated; its fourth field is what the
        # rest of the script uses as the ASIN.
        asins.append(element.attrib["name"].split(".")[3])
except Exception:
    # "except Exception" (not a bare except) so Ctrl-C and SystemExit are not
    # misreported as a bad wish list identifier.
    print("There was an error. Did you provide a working Amazon Wish List identifier?")
    # The previous "exit;" only evaluated the name and never stopped the
    # script; actually terminate, with a non-zero status.
    raise SystemExit(1)
# Process ASINs for further details
# For each ASIN: fetch the product page, read title and price, classify the
# price with is_cheap() and store the result with save().
for asin in asins:
    # Placeholder meaning "no price found"; is_cheap() maps it to
    # "We can't get a price."
    price = 9999999
    url = "http://www.amazon.co.uk/dp/" + asin
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    # NOTE(review): raises IndexError when the page has no btAsinTitle span.
    title = root.cssselect("span[id='btAsinTitle']")[0].text_content()
    if "Kindle Edition" in title:  # Check if this is an item for the Amazon Kindle
        # Kindle pages are read from the form input named "displayedPrice".
        for element in root.cssselect("div[class='buying'] input"):
            # .get() so inputs without a name attribute are skipped instead
            # of raising KeyError.
            if element.attrib.get("name") == "displayedPrice":
                price = float(element.attrib["value"])
    else:
        try:  # Try and get the price of the item. Some items throw back garbage so the price is listed as cannot be found.
            # [1:] drops the first character of the price text before
            # parsing (presumably the currency symbol).
            price = float(root.cssselect("span[id='actualPriceValue'] b")[0].text_content()[1:])
        except (IndexError, ValueError):
            # IndexError: price element missing. ValueError: text present
            # but not a parseable number. Either way keep the placeholder.
            print("We can't get a price for: " + title)
            print("Try camelcamelcamel: http://uk.camelcamelcamel.com/product/" + asin)
    # Classify once, after either branch has had its chance to set price.
    cheap = is_cheap(price)
    print(save(title, url, cheap))