-
Notifications
You must be signed in to change notification settings - Fork 6.3k
/
Copy pathweb_to_gcs.py
66 lines (51 loc) · 2.07 KB
/
web_to_gcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import io
import os
import requests
import pandas as pd
from google.cloud import storage
"""
Pre-reqs:
1. `pip install pandas pyarrow google-cloud-storage`
2. Set GOOGLE_APPLICATION_CREDENTIALS to your project/service-account key
3. Set GCP_GCS_BUCKET as your bucket or change default value of BUCKET
"""
# services = ['fhv','green','yellow']
init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/'
# switch out the bucketname
BUCKET = os.environ.get("GCP_GCS_BUCKET", "dtc-data-lake-bucketname")
def upload_to_gcs(bucket, object_name, local_file):
"""
Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
"""
# # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
# # (Ref: https://github.com/googleapis/python-storage/issues/74)
# storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024 # 5 MB
# storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024 # 5 MB
client = storage.Client()
bucket = client.bucket(bucket)
blob = bucket.blob(object_name)
blob.upload_from_filename(local_file)
def web_to_gcs(year, service):
for i in range(12):
# sets the month part of the file_name string
month = '0'+str(i+1)
month = month[-2:]
# csv file_name
file_name = f"{service}_tripdata_{year}-{month}.csv.gz"
# download it using requests via a pandas df
request_url = f"{init_url}{service}/{file_name}"
r = requests.get(request_url)
open(file_name, 'wb').write(r.content)
print(f"Local: {file_name}")
# read it back into a parquet file
df = pd.read_csv(file_name, compression='gzip')
file_name = file_name.replace('.csv.gz', '.parquet')
df.to_parquet(file_name, engine='pyarrow')
print(f"Parquet: {file_name}")
# upload it to gcs
upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name)
print(f"GCS: {service}/{file_name}")
web_to_gcs('2019', 'green')
web_to_gcs('2020', 'green')
# web_to_gcs('2019', 'yellow')
# web_to_gcs('2020', 'yellow')