-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgeolookup_data.py
More file actions
127 lines (98 loc) · 3.86 KB
/
geolookup_data.py
File metadata and controls
127 lines (98 loc) · 3.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
import argparse
import collections
from datetime import datetime
import geopy
from geopy.geocoders import Nominatim
import math
import os
from pprint import pprint
import re
import sys
import time
import lib.io_utils as io
import lib.list_utils as lu
import lib.math_utils as mu
# Geocode the distinct "<Locale>, <Country>" strings found in the input CSV
# with Nominatim. Each lookup result is appended to a cache CSV as soon as it
# is known so an interrupted run can resume, and the matched place name and
# coordinates are finally copied back onto the items and written to the
# output CSV.

# input
parser = argparse.ArgumentParser()
parser.add_argument('-in', dest="INPUT_FILE", default="data/MexicoAndCentralAmerica_cleaned.csv", help="File generated by clean_data.py")
parser.add_argument('-useragent', dest="USER_AGENT_STRING", default="amnh-anthro-collections/1.0 (bfoo@amnh.org)", help="User agent string for identifying your application to Nominatim")
parser.add_argument('-cache', dest="CACHE_FILE", default="data/processed/MexicoAndCentralAmerica_locales.csv", help="A file to save the progress of locale lookup")
parser.add_argument('-out', dest="OUTPUT_FILE", default="data/processed/MexicoAndCentralAmerica_geocoded.csv", help="Output csv file")
parser.add_argument('-wait', dest="WAIT_SECONDS", default=5, type=int, help="Seconds to wait before each request")
a = parser.parse_args()

LOCALE_FIELD = "Locale"
CACHE_FIELDS = ["LookupString", "Latitude", "Longitude", "GeoName"]

# Make sure output dirs exist
io.makeDirectories([a.OUTPUT_FILE, a.CACHE_FILE])

fieldNames, items = io.readCsv(a.INPUT_FILE)

# Build the string that is sent to the geocoder for each item
for i, item in enumerate(items):
    items[i]["LookupString"] = ""
    # only lookup items that have country and locale with values
    if len(item["Country"]) < 1 or len(item[LOCALE_FIELD]) < 1:
        continue
    items[i]["LookupString"] = item[LOCALE_FIELD] + ", " + item["Country"]

# Resume from the results of a previous run, if there are any
locales = []
localeLookup = {}
if os.path.isfile(a.CACHE_FILE):
    _, locales = io.readCsv(a.CACHE_FILE)
    localeLookup = lu.createLookup(locales, "LookupString")

# Every field that is copied onto the items below needs an output column
for geoField in ("GeoName", "Latitude", "Longitude"):
    if geoField not in fieldNames:
        fieldNames.append(geoField)

# Look up each distinct string once, most frequent first
values = [item["LookupString"] for item in items]
counter = collections.Counter(values)
counts = counter.most_common()
total = len(counts)

geolocator = Nominatim(user_agent=a.USER_AGENT_STRING)
for i, (value, _count) in enumerate(counts):
    # Items without a country or locale were given an empty lookup string
    if len(str(value).strip()) < 1:
        continue

    if value in localeLookup:
        print("Already found %s" % value)
        continue

    print("Looking up %s..." % value)
    try:
        location = geolocator.geocode(value)
    except geopy.exc.GeocoderServiceError:
        # Timeouts, rate limiting and outages say nothing about whether the
        # place exists, so the value is kept out of the cache and will be
        # retried on the next run. Still pause before the next request.
        print("Geopy error; skipping...")
        time.sleep(a.WAIT_SECONDS)
        continue

    # A row with an empty GeoName and 0/0 coordinates records "not found"
    row = {
        "LookupString": value,
        "Latitude": 0,
        "Longitude": 0,
        "GeoName": ""
    }
    if location is None:
        print("Could not find %s" % value)
    else:
        print("Found %s" % location.address)
        row["GeoName"] = location.address
        row["Latitude"] = location.latitude
        row["Longitude"] = location.longitude

    locales.append(row)
    localeLookup[value] = row

    # Save progress
    io.writeCsv(a.CACHE_FILE, locales, CACHE_FIELDS)
    print("Progress: %s%%" % round(1.0 * (i+1) / total * 100, 2))

    time.sleep(a.WAIT_SECONDS)

localeCount = len(locales)
print("%s Locales in total" % localeCount)
geocodedCount = len([locale for locale in locales if len(locale["GeoName"]) > 0])
# There may be nothing to report on when no item had both a country and locale
geocodedPercent = round(1.0 * geocodedCount / localeCount * 100, 2) if localeCount > 0 else 0.0
print("%s (%s%%) successfully geocoded" % (geocodedCount, geocodedPercent))

# Add geo to items
for i, item in enumerate(items):
    lookupString = item["LookupString"]
    if len(lookupString) > 0 and lookupString in localeLookup:
        geo = localeLookup[lookupString]
        if len(geo["GeoName"]) > 0:
            items[i]["GeoName"] = geo["GeoName"]
        # 0/0 is the "not found" placeholder; do not overwrite existing values with it
        # NOTE(review): assumes io.readCsv returns cached coordinates as numbers - confirm
        if not (geo["Latitude"] == 0 and geo["Longitude"] == 0):
            items[i]["Latitude"] = geo["Latitude"]
            items[i]["Longitude"] = geo["Longitude"]

io.writeCsv(a.OUTPUT_FILE, items, fieldNames)