# rungps.py — 65 lines (47 loc) · 2 KB
# -*- coding: utf-8 -*-
import pandas as pd
import scrapy
from scrapy import cmdline
from scrapy.crawler import CrawlerProcess
import os
import logging
# Load the previously scraped Booking.com hotel records and collect every
# hotel page URL for the geo-coordinate spider below.
a = pd.read_json('src/All_Booking_mru.json')
# NOTE(review): read_json already returns a DataFrame, so this wrapper is a
# no-op; kept so the module-level name `all_details` stays available.
all_details = pd.DataFrame(a)
# One URL per hotel row (tolist() replaces the manual append loop).
list_of_urls = all_details['url'].tolist()
def main():
    """Launch the geo-coordinate crawl through the Scrapy command line."""
    # Equivalent to running `scrapy crawl BookingAllGeoSpider` in a shell.
    cmdline.execute(['scrapy', 'crawl', 'BookingAllGeoSpider'])
def load_data():
    """Scrape latitude/longitude for every hotel URL and return a DataFrame.

    Runs a one-off Scrapy crawl over the module-level ``list_of_urls``,
    writing each hotel's coordinates to ``src/hotel_coord_mru.json``, then
    reads that feed file back with pandas.

    Returns:
        pandas.DataFrame: one ``lat_lon`` column holding the raw
        ``data-atlas-latlng`` attribute values scraped from each page.

    NOTE(review): ``CrawlerProcess.start()`` runs the Twisted reactor, which
    can only be started once per Python process — calling ``load_data()`` a
    second time in the same interpreter will fail.
    """

    class BookingAllGeoSpider(scrapy.Spider):
        # Spider name required by Scrapy.
        name = "allgeobookingmru"

        def start_requests(self):
            # One request per hotel page collected at module import time.
            for url in list_of_urls:
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            # '#hotel_address' is a unique element id on the hotel page; its
            # data-atlas-latlng attribute carries the coordinates.
            for result in response.css('#hotel_address'):
                yield {'lat_lon': result.css('::attr(data-atlas-latlng)').get()}

    # Output feed file. Delete any stale copy first, because Scrapy's JSON
    # feed would otherwise concatenate new results onto the old file.
    filename1 = "hotel_coord_mru.json"
    output_path = 'src/' + filename1
    if os.path.exists(output_path):
        os.remove(output_path)

    # Declare a new CrawlerProcess with some settings.
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Chrome/84.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_LEVEL': logging.INFO,
        "FEEDS": {
            output_path: {"format": "json"},
        },
    })
    # Start the crawl; blocks until the spider finishes.
    process.crawl(BookingAllGeoSpider)
    process.start()

    datahotelgps = pd.read_json(output_path)
    return datahotelgps
datahotelgps = load_data()