1
+ import warnings
1
2
import xarray as xr
2
3
import regionmask
3
- import warnings
4
+ import pandas as pd
5
+
6
+ from datasets import load_dataset
4
7
5
8
from unileaf_util .framework .transformers .data_encoder import DataEncoder
6
9
7
10
from . import constants
8
11
9
12
class ELUCData ():
13
+ """
14
+ Object to automatically handle the processing of ELUC data.
15
+ Load with import_data() then process into a df with da_to_df().
16
+ Maintains train and test dataframes, encoder for data, and encoded versions of train and test.
17
+ """
10
18
11
19
    def import_data(self, path, update_path):
        """
        Reads in raw data and update data and processes them into an xarray Dataset.
        Replace ELUC and cell_area columns with updated ones.
        Shift diffs back a year so they align in our CAO POV.
        Originally: land use for 2021, what changed from 2020-2021, ELUC for end of 2021
        Now: land use for 2021, what changed from 2021-2022, ELUC for end of 2021

        :param path: path to the zarr store holding the raw data (opened with dask auto-chunking).
        :param update_path: path to the dataset providing the updated ELUC / cell_area
            variables — presumably a netCDF file, since it is opened with xr.open_dataset;
            TODO confirm. Required here: callers must only invoke this when an update
            file exists (see __init__, which gates on update_path).
        :return: merged xarray Dataset with action diffs shifted back one year and a
            per-cell "country" variable added.
        """
        raw = None
        # TODO: This is a bit of a hack because I'm not sure how to handle the dask warnings
        # (open_zarr with chunks="auto" emits warnings we deliberately silence here).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            raw = xr.open_zarr(path, consolidated=True, chunks="auto")

        # Get updated ELUC: drop the stale ELUC/cell_area variables from the raw
        # data, then merge in the corrected ones from the update file.
        eluc = xr.open_dataset(update_path)
        raw = raw.drop_vars(["ELUC", "cell_area"])
        raw = raw.merge(eluc)

        # Shift actions back a year: each land-use-type "<col>_diff" column is moved
        # one step earlier along time (shift(time=-1)), so a row pairs this year's
        # land use with the change into NEXT year (see docstring). The final year's
        # diffs become NaN as a side effect of the shift.
        raw_diffs = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
        raw_diffs = [f"{col}_diff" for col in raw_diffs]
        raw[raw_diffs] = raw[raw_diffs].shift(time=-1)

        # Label each grid cell with a country region number from the Natural Earth
        # 110m country outlines, so rows can later be subset by country.
        # NOTE(review): cells outside any country come back as NaN per regionmask's
        # mask semantics — confirm downstream code tolerates that.
        country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
        raw["country"] = country_mask
        return raw
40
47
41
- def __init__ (self , path , update_path , start_year = 1851 , test_year = 2012 , end_year = 2022 , countries = None , merge_crop = False ):
48
+
49
+ def hf_to_df (self , hf_repo ):
50
+ """
51
+ Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
52
+ Keep old time/lat/lon columns so we can use them as features later.
53
+ """
54
+ ds = load_dataset (hf_repo )["train" ]
55
+ df = ds .to_pandas ()
56
+ df = df .set_index (["time" , "lat" , "lon" ], drop = False )
57
+ return df
58
+
59
+
60
+ def __init__ (self , path : str , update_path = None , start_year = 1851 , test_year = 2012 , end_year = 2022 , countries = None , merge_crop = False ):
61
+ """
62
+ If update_path is given, load raw data the old way using 2 files that are merged.
63
+ Otherwise, path is taken to be a huggingface repo and we load the data from there.
64
+ """
42
65
assert start_year < test_year and test_year < end_year
43
- raw = self .import_data (path , update_path )
44
- df = self .da_to_df (raw , start_year , end_year , countries , merge_crop )
45
- self .train_df = df .loc [:test_year ]
46
- self .test_df = df .loc [test_year :]
66
+
67
+ if update_path :
68
+ raw = self .import_data (path , update_path )
69
+ df = self .da_to_df (raw , start_year , end_year , countries , merge_crop )
70
+
71
+ else :
72
+ df = self .hf_to_df (path )
73
+ if countries :
74
+ df = self .subset_countries (df , countries )
75
+
76
+ self .train_df = df .loc [start_year :test_year ]
77
+ self .test_df = df .loc [test_year :end_year ]
47
78
48
79
self .encoder = DataEncoder (self .get_fields (), constants .CAO_MAPPING )
49
80
self .encoded_train_df = None
@@ -58,7 +89,19 @@ def subset_countries(self, df, countries):
58
89
return df [df ["country" ].isin (idx )].copy ()
59
90
60
91
61
- def da_to_df (self , da , start_year = None , end_year = None , countries = None , merge_crop = False ):
92
+ def da_to_df (self , da : xr .DataArray , start_year = None , end_year = None , countries = None , merge_crop = False ) -> pd .DataFrame :
93
+ """
94
+ Converts an xarray DataArray to a pandas DataFrame.
95
+ Duplicates indices into columns so we can use them as features.
96
+ Adds country name column for easier access.
97
+ :param da: xarray DataArray to convert.
98
+ :param start_year: Year to start at (inclusive)
99
+ :param end_year: Year to end at (uninclusive)
100
+ :param countries: List of country abbreviations to subset by
101
+ :param merge_crop: Whether to merge crop columns into one column.
102
+ (Note: Still leaves crop types untouched, just adds merged crop column)
103
+ :return: pandas DataFrame
104
+ """
62
105
df = da .to_dataframe ()
63
106
df = df .dropna ()
64
107
@@ -85,7 +128,10 @@ def da_to_df(self, da, start_year=None, end_year=None, countries=None, merge_cro
85
128
return df
86
129
87
130
88
- def get_fields (self ):
131
+ def get_fields (self ) -> dict :
132
+ """
133
+ Creates fields json object for the data encoder/prescriptor.
134
+ """
89
135
fields_df = self .train_df [constants .CAO_MAPPING ["context" ] + constants .CAO_MAPPING ["actions" ] + ["ELUC" ]].astype ("float64" )
90
136
fields = dict ()
91
137
# TODO: Right now this doesn't work because we don't have separate CAO mappings for merged and not merged crops
0 commit comments