import json
import sys
from time import sleep

import requests

# Try the legacy BeautifulSoup package first, fall back to bs4 (BeautifulSoup 4).
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

BASE_URL = "http://allrecipes.com"


def str_to_min(ready_in_time_str):
    """Convert a time string such as "1 h 30 m" into total minutes."""
    if "h" in ready_in_time_str:
        hour = int(ready_in_time_str.split("h")[0].strip())
        # Keep only what follows the hour marker; replacing str(hour) in the
        # whole string would also delete matching digits in the minutes part.
        ready_in_time_str = ready_in_time_str.split("h", 1)[1].strip()
    else:
        hour = 0
    ready_in_time_str = ready_in_time_str.replace("m", "").strip()
    if not ready_in_time_str:
        minutes = 0
    else:
        minutes = int(ready_in_time_str)
    return hour * 60 + minutes
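
# Illustrative checks for str_to_min (not part of the original script); these
# follow from the definition above and can be run in a REPL:
#   str_to_min("1 h 30 m")  # -> 90
#   str_to_min("45 m")      # -> 45
#   str_to_min("2 h")       # -> 120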


def parse_single_page(url_suff):
    """Fetch one recipe page and return its parsed fields as a dict."""
    res = requests.get(BASE_URL + url_suff)
    res_html = BeautifulSoup(res.text, "html.parser")
    # debugging against a saved copy of a page:
    # with open("examples/debug.html") as f:
    #     res = f.read()
    # res_html = BeautifulSoup(res, "html.parser")

    # title
    try:
        title = res_html.find("h1", itemprop="name").text
    except AttributeError:
        # Fall back to the <title> tag, keeping only the part before the dash.
        title = res_html.find("title").text.split("-")[0].strip()

    # rating and rating scale (the maximum rating possible)
    rating = float(res_html.find("meta", property="og:rating")["content"])
    rating_scale = float(res_html.find("meta", property="og:rating_scale")["content"])

    # total time (prep + cook); missing on some pages
    try:
        ready_in_time_str = res_html.find("span", {"class": "ready-in-time"}).text
        ready_in_time_min = str_to_min(ready_in_time_str.lower())
    except (AttributeError, ValueError):
        ready_in_time_min = None

    servings = float(res_html.find("meta", id="metaRecipeServings")["content"])
    try:
        calories = float(res_html.find("span", {"class": "calorie-count"}).find("span").text)
    except (AttributeError, ValueError):
        calories = None

    # Ingredients are split across <ul> columns whose ids start with
    # "lst_ingredients_"; skip the "Add all ingredients to list" button row.
    ingredients = []
    for col in res_html.find_all("ul", id=lambda x: x and x.startswith("lst_ingredients_")):
        for ing in col.find_all("li"):
            if "Add all ingredients to list" not in ing.text:
                ingredients.append(ing.text.replace("ADVERTISEMENT", "").strip())

    # prep time; cook time is derived as total time minus prep time
    try:
        prep_time_str = res_html.find("time", itemprop="prepTime").text
        prep_time_min = str_to_min(prep_time_str.lower())
        cook_time_min = ready_in_time_min - prep_time_min
    except (AttributeError, TypeError, ValueError):
        # TypeError covers ready_in_time_min being None above.
        prep_time_min = None
        cook_time_min = None

    directions = []
    for step in res_html.find_all("li", {"class": "step"}):
        if step.text:
            directions.append(step.text)

    url = BASE_URL + url_suff
    ret = {
        "title": title,
        "rating": rating,
        "rating_scale": rating_scale,
        "rating_ratio": rating / rating_scale,
        "servings": servings,
        "calories": calories,
        "ready_time": ready_in_time_min,
        "prep_time": prep_time_min,
        "cook_time": cook_time_min,
        "ingredients": ingredients,
        "directions": directions,
        "url": url,
    }
    # debug
    # with open("examples/debug.json", "w") as f:
    #     json.dump(ret, f, indent=4)
    # print(ret)
    return ret
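
# Shape of a parsed record (illustrative values only; actual numbers depend on
# the page being scraped):
#   {"title": "Roast Sticky Chicken", "rating": 4.5, "rating_scale": 5.0,
#    "rating_ratio": 0.9, "servings": 4.0, "calories": 300.0, "ready_time": 90,
#    "prep_time": 15, "cook_time": 75, "ingredients": [...],
#    "directions": [...], "url": "http://allrecipes.com/recipe/..."}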


def get_recipes_from_page(page_num):
    """Return the recipe URL suffixes linked from one listing page."""
    res = requests.get(BASE_URL + "/recipes/?page=" + str(page_num))
    res_html = BeautifulSoup(res.text, "html.parser")
    recipes_list = []
    article_tags = res_html.find_all("article")
    for tag in article_tags:
        # Skip articles with no link (TypeError) or no href attribute (KeyError).
        try:
            if tag.find("a")["href"].startswith("/recipe/"):
                recipes_list.append(tag.find("a")["href"])
        except (KeyError, TypeError):
            pass
    return recipes_list
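
# Example return value (the suffix is the one used in the debug line below):
#   ["/recipe/45954/roast-sticky-chicken-rotisserie-style/", ...]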


if __name__ == '__main__':
    # debug
    # parse_single_page("/recipe/45954/roast-sticky-chicken-rotisserie-style/")

    # usage: python parser.py <start_page> <end_page>  (end page is exclusive)
    page_num = sys.argv[1]
    page_end = sys.argv[2]
    for i in range(int(page_num), int(page_end)):
        recipes_list = get_recipes_from_page(i)
        if not recipes_list:
            continue
        for page in recipes_list:
            ret = parse_single_page(page)
            with open("real_data", "a") as f:
                f.write(json.dumps(ret) + "\n")
            sleep(5)  # pause between requests to avoid hammering the site
        with open("savestate", "w") as f:
            f.write("last page parsed is: " + str(i))