# rungps.py — 65 lines (47 loc) · 2 KB
# -*- coding: utf-8 -*-
import pandas as pd
import scrapy
from scrapy import cmdline
from scrapy.crawler import CrawlerProcess
import os
import logging
# Load the previously scraped Booking.com hotel records and collect every
# hotel page URL for the geo-coordinate spider below.
a = pd.read_json('src/All_Booking_mru.json')
# NOTE(review): read_json already returns a DataFrame, so this wrapper is a
# no-op; kept so the module-level name `all_details` stays available.
all_details = pd.DataFrame(a)
# One URL per hotel row (tolist() replaces the manual append loop).
list_of_urls = all_details['url'].tolist()
def main():
    """Launch the geo-coordinate crawl through the Scrapy command line."""
    # Equivalent to running `scrapy crawl BookingAllGeoSpider` in a shell.
    cmdline.execute(['scrapy', 'crawl', 'BookingAllGeoSpider'])
def load_data():
    """Scrape latitude/longitude for every hotel URL and return a DataFrame.

    Runs a one-off Scrapy crawl over the module-level ``list_of_urls``,
    writing each hotel's coordinates to ``src/hotel_coord_mru.json``, then
    reads that feed file back with pandas.

    Returns:
        pandas.DataFrame: one ``lat_lon`` column holding the raw
        ``data-atlas-latlng`` attribute values scraped from each page.

    NOTE(review): ``CrawlerProcess.start()`` runs the Twisted reactor, which
    can only be started once per Python process — calling ``load_data()`` a
    second time in the same interpreter will fail.
    """

    class BookingAllGeoSpider(scrapy.Spider):
        # Spider name required by Scrapy.
        name = "allgeobookingmru"

        def start_requests(self):
            # One request per hotel page collected at module import time.
            for url in list_of_urls:
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            # '#hotel_address' is a unique element id on the hotel page; its
            # data-atlas-latlng attribute carries the coordinates.
            for result in response.css('#hotel_address'):
                yield {'lat_lon': result.css('::attr(data-atlas-latlng)').get()}

    # Output feed file. Delete any stale copy first, because Scrapy's JSON
    # feed would otherwise concatenate new results onto the old file.
    filename1 = "hotel_coord_mru.json"
    output_path = 'src/' + filename1
    if os.path.exists(output_path):
        os.remove(output_path)

    # Declare a new CrawlerProcess with some settings.
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Chrome/84.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_LEVEL': logging.INFO,
        "FEEDS": {
            output_path: {"format": "json"},
        },
    })
    # Start the crawl; blocks until the spider finishes.
    process.crawl(BookingAllGeoSpider)
    process.start()

    datahotelgps = pd.read_json(output_path)
    return datahotelgps
datahotelgps = load_data()