Skip to content

Commit 0e31162

Browse files
committed
Updated data to work with huggingface, refactored linreg and random forest to contain features
1 parent f9c8277 commit 0e31162

File tree

11 files changed

+407
-56
lines changed

11 files changed

+407
-56
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,6 @@ dmypy.json
140140

141141
.DS_Store
142142
.idea/
143+
144+
# vscode
145+
.vscode

use_cases/eluc/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*.joblib
2+
*.pt
3+
predictors/**/*.json

use_cases/eluc/data/constants.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
from pathlib import Path
2+
13
import pandas as pd
24
import regionmask
3-
from pathlib import Path
45

56
# TODO: This has to be changed for your local machine
67
ROOT_DIR = Path("/Users/964840/workspace/mvp/use_cases/eluc")
@@ -44,7 +45,7 @@
4445
"J": 388,
4546
"NA": 516,
4647
"PAL": 275,
47-
"J": 400,
48+
# "J": 400,
4849
"IRQ": 368,
4950
"IND": 356,
5051
"IRN": 364,

use_cases/eluc/data/data.py

+66-20
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,80 @@
1+
import warnings
12
import xarray as xr
23
import regionmask
3-
import warnings
4+
import pandas as pd
5+
6+
from datasets import load_dataset
47

58
from unileaf_util.framework.transformers.data_encoder import DataEncoder
69

710
from . import constants
811

912
class ELUCData():
13+
"""
14+
Object to automatically handle the processing of ELUC data.
15+
Load with import_data() then process into a df with da_to_df().
16+
Maintains train and test dataframes, encoder for data, and encoded versions of train and test.
17+
"""
1018

1119
def import_data(self, path, update_path):
20+
"""
21+
Reads in raw data and update data and processes them into an xarray.
22+
Replace ELUC and cell_area columns with updated ones.
23+
Shift diffs back a year so they align in our CAO POV.
24+
Originally: land use for 2021, what changed from 2020-2021, ELUC for end of 2021
25+
Now: land use for 2021, what changed from 2021-2022, ELUC for end of 2021
26+
"""
1227
raw = None
1328
# TODO: This is a bit of a hack because I'm not sure how to handle the dask warnings
1429
with warnings.catch_warnings():
1530
warnings.simplefilter("ignore")
1631
raw = xr.open_zarr(path, consolidated=True, chunks="auto")
1732

1833
# Get updated ELUC
19-
if update_path:
20-
eluc = xr.open_dataset(update_path)
21-
raw = raw.drop_vars(["ELUC", "cell_area"])
22-
raw = raw.merge(eluc)
34+
eluc = xr.open_dataset(update_path)
35+
raw = raw.drop_vars(["ELUC", "cell_area"])
36+
raw = raw.merge(eluc)
2337

2438
# Shift actions back a year
2539
raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
2640
raw_diffs = [f"{col}_diff" for col in raw_diffs]
2741
raw[raw_diffs] = raw[raw_diffs].shift(time=-1)
2842

29-
# Old time shifting
30-
# raw['ELUC'] = raw['ELUC'].shift(time=1)
31-
# raw['ELUC_diff'] = raw['ELUC_diff'].shift(time=1)
32-
# raw['time'] = raw.time - 1
33-
# assert(list(np.unique(raw.time)) == list(range(1849, 2022)))
34-
# mask = raw["ELUC_diff"].isnull().compute()
35-
# raw = raw.where(~mask, drop=True)
36-
43+
# Label each grid cell with its Natural Earth (110m) country index so the data can be subset by country later
3744
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
3845
raw["country"] = country_mask
3946
return raw
4047

41-
def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year=2022, countries=None, merge_crop=False):
48+
49+
def hf_to_df(self, hf_repo):
50+
"""
51+
Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
52+
Keep old time/lat/lon columns so we can use them as features later.
53+
"""
54+
ds = load_dataset(hf_repo)["train"]
55+
df = ds.to_pandas()
56+
df = df.set_index(["time", "lat", "lon"], drop=False)
57+
return df
58+
59+
60+
def __init__(self, path: str, update_path=None, start_year=1851, test_year=2012, end_year=2022, countries=None, merge_crop=False):
61+
"""
62+
If update_path is given, load raw data the old way using 2 files that are merged.
63+
Otherwise, path is taken to be a huggingface repo and we load the data from there.
64+
"""
4265
assert start_year < test_year and test_year < end_year
43-
raw = self.import_data(path, update_path)
44-
df = self.da_to_df(raw, start_year, end_year, countries, merge_crop)
45-
self.train_df = df.loc[:test_year]
46-
self.test_df = df.loc[test_year:]
66+
67+
if update_path:
68+
raw = self.import_data(path, update_path)
69+
df = self.da_to_df(raw, start_year, end_year, countries, merge_crop)
70+
71+
else:
72+
df = self.hf_to_df(path)
73+
if countries:
74+
df = self.subset_countries(df, countries)
75+
76+
self.train_df = df.loc[start_year:test_year]
77+
self.test_df = df.loc[test_year:end_year]
4778

4879
self.encoder = DataEncoder(self.get_fields(), constants.CAO_MAPPING)
4980
self.encoded_train_df = None
@@ -58,7 +89,19 @@ def subset_countries(self, df, countries):
5889
return df[df["country"].isin(idx)].copy()
5990

6091

61-
def da_to_df(self, da, start_year=None, end_year=None, countries=None, merge_crop=False):
92+
def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None, merge_crop=False) -> pd.DataFrame:
93+
"""
94+
Converts an xarray DataArray to a pandas DataFrame.
95+
Duplicates indices into columns so we can use them as features.
96+
Adds country name column for easier access.
97+
:param da: xarray DataArray to convert.
98+
:param start_year: Year to start at (inclusive)
99+
:param end_year: Year to end at (exclusive)
100+
:param countries: List of country abbreviations to subset by
101+
:param merge_crop: Whether to merge crop columns into one column.
102+
(Note: Still leaves crop types untouched, just adds merged crop column)
103+
:return: pandas DataFrame
104+
"""
62105
df = da.to_dataframe()
63106
df = df.dropna()
64107

@@ -85,7 +128,10 @@ def da_to_df(self, da, start_year=None, end_year=None, countries=None, merge_cro
85128
return df
86129

87130

88-
def get_fields(self):
131+
def get_fields(self) -> dict:
132+
"""
133+
Creates fields json object for the data encoder/prescriptor.
134+
"""
89135
fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64")
90136
fields = dict()
91137
# TODO: Right now this doesn't work because we don't have separate CAO mappings for merged and not merged crops
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,42 @@
11
import joblib
2+
import os
3+
import json
4+
25
from sklearn.linear_model import LinearRegression
36

4-
from predictor import Predictor
7+
from predictors.predictor import Predictor
58

69
class LinearRegressionPredictor(Predictor):
710
"""
811
Simple linear regression predictor.
912
"""
10-
def __init__(self, **kwargs):
13+
def __init__(self, features=None, **kwargs):
14+
self.features = features
1115
self.model = LinearRegression(**kwargs)
1216

13-
def save(self, path):
14-
joblib.dump(self.model, path)
17+
def save(self, path: str):
18+
"""
19+
Saves the model and features in a format suitable for loading.
20+
Generates path to folder if it does not exist.
21+
:param path: path to folder to save model files.
22+
"""
23+
os.makedirs(path, exist_ok=True)
24+
config = {
25+
"features": self.features,
26+
}
27+
json.dump(config, open(os.path.join(path, "config.json"), "w"))
28+
joblib.dump(self.model, os.path.join(path, "model.joblib"))
1529

1630
def load(self, path):
17-
self.model = joblib.load(path)
31+
self.features = json.load(open(os.path.join(path, "config.json")))["features"]
32+
self.model = joblib.load(os.path.join(path, "model.joblib"))
1833

1934
def fit(self, X_train, y_train):
35+
if self.features:
36+
X_train = X_train[self.features]
2037
self.model.fit(X_train, y_train)
2138

2239
def predict(self, X_test):
40+
if self.features:
41+
X_test = X_test[self.features]
2342
return self.model.predict(X_test)

use_cases/eluc/predictors/NeuralNetwork/NeuralNetPredictor.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from torch.utils.data import Dataset, DataLoader
1414
from torch.utils.tensorboard import SummaryWriter
1515

16-
from predictor import Predictor
16+
from predictors.predictor import Predictor
1717

1818
class CustomDS(Dataset):
1919
"""
@@ -269,4 +269,9 @@ def predict(self, X_test: pd.DataFrame) -> np.array:
269269
pred_list.append(self.model(X).squeeze())
270270

271271
y_pred = torch.concatenate(pred_list, dim=0).cpu().numpy()
272-
return y_pred
272+
return y_pred
273+
274+
if __name__ == "__main__":
275+
print("a")
276+
print("b")
277+
print("c")
Binary file not shown.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,42 @@
11
import joblib
2+
import os
3+
import json
4+
25
from sklearn.ensemble import RandomForestRegressor
36

4-
from predictor import Predictor
7+
from predictors.predictor import Predictor
58

69
class LinearRegressionPredictor(Predictor):
710
"""
811
Simple random forest predictor.
912
"""
10-
def __init__(self, **kwargs):
13+
def __init__(self, features=None, **kwargs):
14+
self.features = features
1115
self.model = RandomForestRegressor(**kwargs)
1216

13-
def save(self, path):
14-
joblib.dump(self.model, path)
17+
def save(self, path: str):
18+
"""
19+
Saves the model and features in a format suitable for loading.
20+
Generates path to folder if it does not exist.
21+
:param path: path to folder to save model files.
22+
"""
23+
os.makedirs(path, exist_ok=True)
24+
config = {
25+
"features": self.features,
26+
}
27+
json.dump(config, open(os.path.join(path, "config.json"), "w"))
28+
joblib.dump(self.model, os.path.join(path, "model.joblib"))
1529

1630
def load(self, path):
17-
self.model = joblib.load(path)
31+
self.features = json.load(open(os.path.join(path, "config.json")))["features"]
32+
self.model = joblib.load(os.path.join(path, "model.joblib"))
1833

1934
def fit(self, X_train, y_train):
35+
if self.features:
36+
X_train = X_train[self.features]
2037
self.model.fit(X_train, y_train)
2138

2239
def predict(self, X_test):
40+
if self.features:
41+
X_test = X_test[self.features]
2342
return self.model.predict(X_test)

0 commit comments

Comments
 (0)