From 327c649a045e61346ffb26ac6a0ea011477062c0 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 18 Jun 2025 13:13:59 +0100 Subject: [PATCH 01/18] rapids togo labels --- .../rapids_togo/create_windows_for_labels.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 rslp/crop/rapids_togo/create_windows_for_labels.py diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py new file mode 100644 index 00000000..a24d0274 --- /dev/null +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -0,0 +1,209 @@ +"""Create windows for crop type mapping. + +Data from https://zenodo.org/records/3836629 +""" + +import argparse +import hashlib +import multiprocessing +from datetime import datetime, timezone + +import geopandas +import pandas as pd +import shapely +import tqdm +from rslearn.const import WGS84_PROJECTION +from rslearn.dataset import Window +from rslearn.utils import Projection, STGeometry, get_utm_ups_crs +from rslearn.utils.feature import Feature +from rslearn.utils.mp import star_imap_unordered +from rslearn.utils.vector_format import GeojsonVectorFormat +from upath import UPath + +WINDOW_RESOLUTION = 10 +LABEL_LAYER = "label" + +# data was collected in May 2020, so we consider the 6 months before and after may +START_TIME = datetime(2019, 12, 1, tzinfo=timezone.utc) +END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) + + +def process_files(shapefile_path: UPath) -> pd.DataFrame: + """Create windows for crop type mapping. + + Args: + csv_path: path to the csv file + num_pixels: number of points to sample from each polygon + postprocess_categories: whether to postprocess categories + """ + df = geopandas.read_file(shapefile_path) + is_crop = 1 + if "non" in shapefile_path.name.lower(): + is_crop = 0 + + df["is_crop"] = is_crop + + df["longitude"] = df.geometry.centroid.x + df["latitude"] = df.geometry.centroid.y + + df["org_file"] = shapefile_path.name + df.reset_index() + df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) + + return df[["is_crop", "geometry", "latitude", "longitude", "org_file", "unique_id"]] + + +def create_window( + csv_row: pd.Series, + ds_path: UPath, + group_name: str, + window_size: int, +) -> None: + """Create windows for crop type mapping. + + Args: + csv_row: a row of the dataframe + ds_path: path to the dataset + group_name: name of the group + window_size: window size + """ + # Get sample metadata + polygon_id = csv_row["unique_id"] + latitude, longitude = csv_row["latitude"], csv_row["longitude"] + is_crop = csv_row["is_crop"] + category = is_crop + + src_point = shapely.Point(longitude, latitude) + src_geometry = STGeometry(WGS84_PROJECTION, src_point, None) + dst_crs = get_utm_ups_crs(longitude, latitude) + dst_projection = Projection(dst_crs, WINDOW_RESOLUTION, -WINDOW_RESOLUTION) + dst_geometry = src_geometry.to_projection(dst_projection) + + # This is specific for window size = 1. + if window_size == 1: + bounds = ( + int(dst_geometry.shp.x), + int(dst_geometry.shp.y) - window_size, + int(dst_geometry.shp.x) + window_size, + int(dst_geometry.shp.y), + ) + else: + bounds = ( + int(dst_geometry.shp.x), + int(dst_geometry.shp.y), + int(dst_geometry.shp.x) + window_size // 2, + int(dst_geometry.shp.y) + window_size // 2, + ) + + # Check if train or val. + group = f"{group_name}_window_{window_size}" + window_name = f"{polygon_id}_{latitude}_{longitude}" + + # If split by polygon id, no samples from the same polygon will be in the same split. 
+ is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + + if is_val: + split = "val" + else: + split = "train" + + window = Window( + path=Window.get_window_root(ds_path, group, window_name), + group=group, + name=window_name, + projection=dst_projection, + bounds=bounds, + time_range=(START_TIME, END_TIME), + options={ + "split": split, + "is_crop": is_crop, + "category": category, + }, + ) + window.save() + + # Add the label. + feature = Feature( + window.get_geometry(), + { + "category": category, + }, + ) + layer_dir = window.get_layer_dir(LABEL_LAYER) + GeojsonVectorFormat().encode_vector(layer_dir, [feature]) + window.mark_layer_completed(LABEL_LAYER) + + +def create_windows_from_csv( + csv_paths: UPath, + ds_path: UPath, + group_name: str, + window_size: int, +) -> None: + """Create windows from csv. + + Args: + csv_path: path to the csv file + ds_path: path to the dataset + group_name: name of the group + window_size: window size + """ + for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + df_sampled = process_files(csv_paths / filename) + csv_rows = [] + for _, row in df_sampled.iterrows(): + csv_rows.append(row) + + jobs = [ + dict( + csv_row=row, + ds_path=ds_path, + group_name=group_name, + window_size=window_size, + ) + for row in csv_rows + ] + p = multiprocessing.Pool(32) + outputs = star_imap_unordered(p, create_window, jobs) + for _ in tqdm.tqdm(outputs, total=len(jobs)): + pass + p.close() + + +if __name__ == "__main__": + + multiprocessing.set_start_method("forkserver") + parser = argparse.ArgumentParser(description="Create windows from csv") + parser.add_argument( + "--csv_paths", + type=str, + required=True, + help="Path to the csv file", + ) + parser.add_argument( + "--ds_path", + type=str, + required=True, + help="Path to the dataset", + ) + parser.add_argument( + "--group_name", + type=str, + required=False, + help="Name of the group", + default="groundtruth", + ) + parser.add_argument( + "--window_size", + type=int, + required=False, + help="Window size", + default=1, + ) + args = parser.parse_args() + create_windows_from_csv( + UPath(args.csv_paths), + UPath(args.ds_path), + args.group_name, + window_size=args.window_size, + ) From 3a79a54c4ee2217b85f47a80e69026bd5f3dc021 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 18 Jun 2025 13:31:37 +0100 Subject: [PATCH 02/18] Add geopandas to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7358718e..d46ef940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ ruff>=0.7 scikit-image>=0.23 typing-extensions>=4.11 uvicorn>=0.32 +geopandas>=1.1.0 From 492a2316e92c64856ce4ecf5160565dba3039ced Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 14:39:42 -0700 Subject: [PATCH 03/18] use csvs instead of shapefiles --- .../rapids_togo/create_windows_for_labels.py | 31 ++------------- rslp/crop/rapids_togo/to_csv.py | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+), 28 deletions(-) create mode 100644 rslp/crop/rapids_togo/to_csv.py diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index a24d0274..72c6b03e 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -28,31 +28,6 @@ END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) -def process_files(shapefile_path: UPath) -> pd.DataFrame: - """Create windows for crop type mapping. 
- - Args: - csv_path: path to the csv file - num_pixels: number of points to sample from each polygon - postprocess_categories: whether to postprocess categories - """ - df = geopandas.read_file(shapefile_path) - is_crop = 1 - if "non" in shapefile_path.name.lower(): - is_crop = 0 - - df["is_crop"] = is_crop - - df["longitude"] = df.geometry.centroid.x - df["latitude"] = df.geometry.centroid.y - - df["org_file"] = shapefile_path.name - df.reset_index() - df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) - - return df[["is_crop", "geometry", "latitude", "longitude", "org_file", "unique_id"]] - - def create_window( csv_row: pd.Series, ds_path: UPath, @@ -148,8 +123,8 @@ def create_windows_from_csv( group_name: name of the group window_size: window size """ - for filename in ["crop_merged_v2", "noncrop_merged_v2"]: - df_sampled = process_files(csv_paths / filename) + for filename in ["crop_merged_v2.csv", "noncrop_merged_v2.csv"]: + df_sampled = pd.read_csv(csv_paths / filename) csv_rows = [] for _, row in df_sampled.iterrows(): csv_rows.append(row) @@ -177,7 +152,7 @@ def create_windows_from_csv( parser.add_argument( "--csv_paths", type=str, - required=True, + default="gs://ai2-helios-us-central1/evaluations/crop_type_mapping/togo_2020", help="Path to the csv file", ) parser.add_argument( diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py new file mode 100644 index 00000000..20b5e90a --- /dev/null +++ b/rslp/crop/rapids_togo/to_csv.py @@ -0,0 +1,38 @@ +""" +It's easier for google cloud if the files are in a single csv instead of +in a shapefile, so we process it into csvs. +""" +from upath import UPath +import pandas as pd +import geopandas + + +def process_files(shapefile_path: UPath) -> pd.DataFrame: + """Create windows for crop type mapping. 
+ + Args: + csv_path: path to the csv file + num_pixels: number of points to sample from each polygon + postprocess_categories: whether to postprocess categories + """ + df = geopandas.read_file(shapefile_path) + is_crop = 1 + if "non" in shapefile_path.name.lower(): + is_crop = 0 + + df["is_crop"] = is_crop + + df["longitude"] = df.geometry.centroid.x + df["latitude"] = df.geometry.centroid.y + + df["org_file"] = shapefile_path.name + df.reset_index() + df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) + + return df[["is_crop", "latitude", "longitude", "org_file", "unique_id"]] + + +if __name__ == "__main__": + for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + df = process_files(UPath(filename)) + df.to_csv(f"{UPath(filename).stem}.csv") From 8fd616cb5d58493701798b30f9f7d9630ef4b8c1 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:05:56 -0700 Subject: [PATCH 04/18] Add config and finetuning yaml for Togo 2020 --- data/helios/togo_2020/config.json | 126 ++++++++ data/helios/togo_2020/finetune_12_months.yaml | 271 ++++++++++++++++++ 2 files changed, 397 insertions(+) create mode 100644 data/helios/togo_2020/config.json create mode 100644 data/helios/togo_2020/finetune_12_months.yaml diff --git a/data/helios/togo_2020/config.json b/data/helios/togo_2020/config.json new file mode 100644 index 00000000..8f9cd926 --- /dev/null +++ b/data/helios/togo_2020/config.json @@ -0,0 +1,126 @@ +{ + "layers": { + "label": { + "type": "vector" + }, + "sentinel1_ascending": { + "band_sets": [ + { + "bands": [ + "vv", + "vh" + ], + "dtype": "float32" + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "ingest": false, + "name": "rslp.satlas.data_sources.MonthlySentinel1", + "query": { + "sar:instrument_mode": { + "eq": "IW" + }, + "sar:polarizations": { + "eq": [ + "VV", + "VH" + ] + }, + "sat:orbit_state": { + "eq": "ascending" + } + }, + "query_config": { + "max_matches": 12 + }, + "time_offset": "-180d" + }, + "type": "raster" + }, + "sentinel1_descending": { + "band_sets": [ + { + "bands": [ + "vv", + "vh" + ], + "dtype": "float32" + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "ingest": false, + "name": "rslp.satlas.data_sources.MonthlySentinel1", + "query": { + "sar:instrument_mode": { + "eq": "IW" + }, + "sar:polarizations": { + "eq": [ + "VV", + "VH" + ] + }, + "sat:orbit_state": { + "eq": "descending" + } + }, + "query_config": { + "max_matches": 12 + }, + "time_offset": "-180d" + }, + "type": "raster" + }, + "sentinel2": { + "band_sets": [ + { + "bands": [ + "B02", + "B03", + "B04", + "B08" + ], + "dtype": "uint16" + }, + { + "bands": [ + "B05", + "B06", + "B07", + "B8A", + "B11", + "B12" + ], + "dtype": "uint16", + "zoom_offset": -1 + }, + { + "bands": [ + "B01", + "B09" + ], + "dtype": "uint16", + "zoom_offset": -2 + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "harmonize": true, + "ingest": false, + "max_cloud_cover": 50, + "name": "rslp.satlas.data_sources.MonthlyAzureSentinel2", + "query_config": { + "max_matches": 12 + }, + "sort_by": "eo:cloud_cover", + "time_offset": "-180d" + }, + "type": "raster" + } + } +} diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml new file mode 100644 index 00000000..3486a052 --- /dev/null +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -0,0 +1,271 @@ +model: + class_path: 
rslearn.train.lightning_module.RslearnLightningModule + init_args: + model: + class_path: rslearn.models.multitask.MultiTaskModel + init_args: + encoder: + - class_path: rslp.helios.model.Helios + init_args: + checkpoint_path: "{CHECKPOINT_PATH}" + selector: ["encoder"] + forward_kwargs: + patch_size: {PATCH_SIZE} + decoders: + crop_type_classification: + - class_path: rslearn.models.pooling_decoder.PoolingDecoder + init_args: + in_channels: {ENCODER_EMBEDDING_SIZE} + out_channels: 8 + - class_path: rslearn.train.tasks.classification.ClassificationHead + lr: 0.0001 + plateau: true + plateau_factor: 0.2 + plateau_patience: 2 + plateau_min_lr: 0 + plateau_cooldown: 10 +data: + class_path: rslearn.train.data_module.RslearnDataModule + init_args: + path: weka://dfive-default/rslearn-eai/datasets/crop/kenya_nandi/20250625 + inputs: + sentinel2_0: + data_type: "raster" + layers: ["sentinel2"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_1: + data_type: "raster" + layers: ["sentinel2.1"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_2: + data_type: "raster" + layers: ["sentinel2.2"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_3: + data_type: "raster" + layers: ["sentinel2.3"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_4: + data_type: "raster" + layers: ["sentinel2.4"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_5: + data_type: "raster" + layers: ["sentinel2.5"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_6: + data_type: "raster" + layers: ["sentinel2.6"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_7: + data_type: "raster" + layers: ["sentinel2.7"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_8: + data_type: "raster" + layers: ["sentinel2.8"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_9: + data_type: "raster" + layers: ["sentinel2.9"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_10: + data_type: "raster" + layers: ["sentinel2.10"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_11: + data_type: "raster" + layers: ["sentinel2.11"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel1_0: + data_type: "raster" + layers: ["sentinel1_ascending"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_1: + data_type: "raster" + layers: ["sentinel1_ascending.1"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_2: + data_type: "raster" + layers: ["sentinel1_ascending.2"] + bands: ["vv", "vh"] + passthrough: true + dtype: 
FLOAT32 + sentinel1_3: + data_type: "raster" + layers: ["sentinel1_ascending.3"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_4: + data_type: "raster" + layers: ["sentinel1_ascending.4"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_5: + data_type: "raster" + layers: ["sentinel1_ascending.5"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_6: + data_type: "raster" + layers: ["sentinel1_ascending.6"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_7: + data_type: "raster" + layers: ["sentinel1_ascending.7"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_8: + data_type: "raster" + layers: ["sentinel1_ascending.8"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_9: + data_type: "raster" + layers: ["sentinel1_ascending.9"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_10: + data_type: "raster" + layers: ["sentinel1_ascending.10"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_11: + data_type: "raster" + layers: ["sentinel1_ascending.11"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + label: + data_type: "vector" + layers: ["label"] + is_target: true + task: + class_path: rslearn.train.tasks.multi_task.MultiTask + init_args: + tasks: + crop_type_classification: + class_path: rslearn.train.tasks.classification.ClassificationTask + init_args: + property_name: "category" + classes: ["Coffee", "Trees", "Grassland", "Maize", "Sugarcane", "Tea", "Water", "Built-up"] + enable_f1_metric: true + metric_kwargs: + average: "micro" + input_mapping: + crop_type_classification: + label: "targets" + batch_size: 8 + num_workers: 32 + default_config: + transforms: + - class_path: rslearn.train.transforms.concatenate.Concatenate + init_args: + selections: + sentinel2_0: [] + sentinel2_1: [] + sentinel2_2: [] + sentinel2_3: [] + sentinel2_4: [] + sentinel2_5: [] + sentinel2_6: [] + sentinel2_7: [] + sentinel2_8: [] + sentinel2_9: [] + sentinel2_10: [] + sentinel2_11: [] + output_selector: sentinel2_l2a + - class_path: rslearn.train.transforms.concatenate.Concatenate + init_args: + selections: + sentinel1_0: [] + sentinel1_1: [] + sentinel1_2: [] + sentinel1_3: [] + sentinel1_4: [] + sentinel1_5: [] + sentinel1_6: [] + sentinel1_7: [] + sentinel1_8: [] + sentinel1_9: [] + sentinel1_10: [] + sentinel1_11: [] + output_selector: sentinel1 + - class_path: rslp.helios.norm.HeliosNormalize + init_args: + config_fname: "/opt/helios/data/norm_configs/computed.json" + band_names: + sentinel2_l2a: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + sentinel1: ["vv", "vh"] + - class_path: rslearn.train.transforms.pad.Pad + init_args: + size: 8 + mode: "center" + image_selectors: ["sentinel2_l2a", "sentinel1"] + train_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "train" + val_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "val" + test_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "val" +trainer: + max_epochs: 100 + callbacks: + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: "epoch" + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + save_top_k: 1 + save_last: true + monitor: val_loss + mode: min + - class_path: 
rslearn.train.callbacks.freeze_unfreeze.FreezeUnfreeze + init_args: + module_selector: ["model", "encoder", 0] + unfreeze_at_epoch: 2 +rslp_project: placeholder +rslp_experiment: placeholder From 127fc82a3494c42b7085adaf69a40b38aa766bc3 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:30:11 -0700 Subject: [PATCH 05/18] Use the right data for finetuning --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 3486a052..285c2a9a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -27,7 +27,7 @@ model: data: class_path: rslearn.train.data_module.RslearnDataModule init_args: - path: weka://dfive-default/rslearn-eai/datasets/crop/kenya_nandi/20250625 + path: weka://dfive-default/rslearn-eai/datasets/crop/togo/20250701 inputs: sentinel2_0: data_type: "raster" From 54ed66d31f4c4305d60ca1bb1a01d72394c1355e Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:52:00 -0700 Subject: [PATCH 06/18] Fix path --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 285c2a9a..38896d85 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -27,7 +27,7 @@ model: data: class_path: rslearn.train.data_module.RslearnDataModule init_args: - path: weka://dfive-default/rslearn-eai/datasets/crop/togo/20250701 + path: weka://dfive-default/rslearn-eai/datasets/crop/togo_2020/20250701 inputs: sentinel2_0: data_type: "raster" From a3b7c11d0196d44abdd3b979dd30479a630c0555 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 16:01:51 -0700 Subject: [PATCH 07/18] more yaml fixes --- data/helios/togo_2020/finetune_12_months.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 38896d85..5719fe70 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -240,15 +240,15 @@ data: mode: "center" image_selectors: ["sentinel2_l2a", "sentinel1"] train_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "train" val_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "val" test_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "val" trainer: From cb278898d0ed39ef6a8dfb1bd3397e35effabd2a Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 11:25:43 -0700 Subject: [PATCH 08/18] Fix categories for Togo finetuning --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 5719fe70..a6280883 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -185,7 +185,7 @@ data: class_path: rslearn.train.tasks.classification.ClassificationTask init_args: property_name: "category" - classes: ["Coffee", "Trees", "Grassland", "Maize", 
"Sugarcane", "Tea", "Water", "Built-up"] + classes: [1, 0] enable_f1_metric: true metric_kwargs: average: "micro" From 72d7a8135c82e6bb1ed7a09092133930115e8203 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 11:30:11 -0700 Subject: [PATCH 09/18] linting --- requirements.txt | 2 +- rslp/crop/rapids_togo/create_windows_for_labels.py | 4 +--- rslp/crop/rapids_togo/to_csv.py | 12 ++++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index d46ef940..879c91bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ beaker-py>=2.0 fastapi>=0.115 +geopandas>=1.1.0 google-cloud-bigtable>=2.18 google-cloud-pubsub>=2.18 interrogate>=1.7 @@ -12,4 +13,3 @@ ruff>=0.7 scikit-image>=0.23 typing-extensions>=4.11 uvicorn>=0.32 -geopandas>=1.1.0 diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 72c6b03e..1b1f96fa 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -8,7 +8,6 @@ import multiprocessing from datetime import datetime, timezone -import geopandas import pandas as pd import shapely import tqdm @@ -118,7 +117,7 @@ def create_windows_from_csv( """Create windows from csv. Args: - csv_path: path to the csv file + csv_paths: path to the csv files ds_path: path to the dataset group_name: name of the group window_size: window size @@ -146,7 +145,6 @@ def create_windows_from_csv( if __name__ == "__main__": - multiprocessing.set_start_method("forkserver") parser = argparse.ArgumentParser(description="Create windows from csv") parser.add_argument( diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py index 20b5e90a..2fffa242 100644 --- a/rslp/crop/rapids_togo/to_csv.py +++ b/rslp/crop/rapids_togo/to_csv.py @@ -1,19 +1,19 @@ -""" +"""Turn shapefiles into csvs. + It's easier for google cloud if the files are in a single csv instead of in a shapefile, so we process it into csvs. """ -from upath import UPath -import pandas as pd + import geopandas +import pandas as pd +from upath import UPath def process_files(shapefile_path: UPath) -> pd.DataFrame: """Create windows for crop type mapping. 
Args: - csv_path: path to the csv file - num_pixels: number of points to sample from each polygon - postprocess_categories: whether to postprocess categories + shapefile_path: path to the shapefile """ df = geopandas.read_file(shapefile_path) is_crop = 1 From 276ea2b771c315611c9bd86806743cff0b5b52ea Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 12:59:29 -0700 Subject: [PATCH 10/18] fix num classes --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index a6280883..de7b9f1a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -16,7 +16,7 @@ model: - class_path: rslearn.models.pooling_decoder.PoolingDecoder init_args: in_channels: {ENCODER_EMBEDDING_SIZE} - out_channels: 8 + out_channels: 2 - class_path: rslearn.train.tasks.classification.ClassificationHead lr: 0.0001 plateau: true From de9bf8089110bb4545968971a60159205a4951e6 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 13:30:33 -0700 Subject: [PATCH 11/18] Add eval set --- rslp/crop/rapids_togo/to_csv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py index 2fffa242..de3b71f9 100644 --- a/rslp/crop/rapids_togo/to_csv.py +++ b/rslp/crop/rapids_togo/to_csv.py @@ -33,6 +33,10 @@ def process_files(shapefile_path: UPath) -> pd.DataFrame: if __name__ == "__main__": - for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + for filename in ["crop_merged_v2", "noncrop_merged_v2", "togo_test_majority"]: + csv_name = UPath(".") / f"{UPath(filename).stem}.csv" + if csv_name.exists(): + print(f"{csv_name} exists - skipping") + continue df = process_files(UPath(filename)) - df.to_csv(f"{UPath(filename).stem}.csv") + df.to_csv(csv_name) From 23757af770037db5997790cb05ac30740924bcef Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 13:31:46 -0700 Subject: [PATCH 12/18] Add eval to the create_windows function too --- rslp/crop/rapids_togo/create_windows_for_labels.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 1b1f96fa..fe514ee7 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -122,7 +122,11 @@ def create_windows_from_csv( group_name: name of the group window_size: window size """ - for filename in ["crop_merged_v2.csv", "noncrop_merged_v2.csv"]: + for filename in [ + "crop_merged_v2.csv", + "noncrop_merged_v2.csv", + "togo_test_majority.csv", + ]: df_sampled = pd.read_csv(csv_paths / filename) csv_rows = [] for _, row in df_sampled.iterrows(): From ab884c2752963209eea6baf1f139a00a03883a65 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 14:10:34 -0700 Subject: [PATCH 13/18] test category for the test set --- .../rapids_togo/create_windows_for_labels.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index fe514ee7..1978104e 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -28,10 +28,7 @@ def create_window( - csv_row: pd.Series, - ds_path: UPath, - 
group_name: str, - window_size: int, + csv_row: pd.Series, ds_path: UPath, group_name: str, window_size: int, is_test: bool ) -> None: """Create windows for crop type mapping. @@ -40,6 +37,7 @@ def create_window( ds_path: path to the dataset group_name: name of the group window_size: window size + is_test: whether or not this is a test window """ # Get sample metadata polygon_id = csv_row["unique_id"] @@ -69,17 +67,19 @@ def create_window( int(dst_geometry.shp.y) + window_size // 2, ) - # Check if train or val. group = f"{group_name}_window_{window_size}" window_name = f"{polygon_id}_{latitude}_{longitude}" - - # If split by polygon id, no samples from the same polygon will be in the same split. - is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] - - if is_val: - split = "val" + if not is_test: + # Check if train or val. + # If split by polygon id, no samples from the same polygon will be in the same split. + is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + + if is_val: + split = "val" + else: + split = "train" else: - split = "train" + split = "test" window = Window( path=Window.get_window_root(ds_path, group, window_name), From 126dad9a8d20b09dc0a8cc48acfb8c1054dcf164 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 14:13:03 -0700 Subject: [PATCH 14/18] Actually pass the argument --- rslp/crop/rapids_togo/create_windows_for_labels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 1978104e..733392c7 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -138,6 +138,7 @@ def create_windows_from_csv( ds_path=ds_path, group_name=group_name, window_size=window_size, + is_test="test" in filename, ) for row in csv_rows ] From f2a4e6ce504b85c41a3e0e6c8a9b494cd5bed339 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Thu, 3 Jul 2025 10:32:40 -0700 Subject: [PATCH 15/18] keep the encoder frozen - emulate a linear probe --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index de7b9f1a..ded43d15 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -266,6 +266,6 @@ trainer: - class_path: rslearn.train.callbacks.freeze_unfreeze.FreezeUnfreeze init_args: module_selector: ["model", "encoder", 0] - unfreeze_at_epoch: 2 + unfreeze_at_epoch: 100 rslp_project: placeholder rslp_experiment: placeholder From d9e6c836100e57a46cbf5d4bacd6e6101277ad00 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Thu, 3 Jul 2025 11:10:05 -0700 Subject: [PATCH 16/18] use the test set --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index ded43d15..fb701c7a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -250,7 +250,7 @@ data: test_config: groups: ["groundtruth_window_32"] tags: - split: "val" + split: "test" trainer: max_epochs: 100 callbacks: From fee1c9e9683af420ba8a2ffbdf8f727624c60002 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 9 Jul 2025 16:04:22 -0700 Subject: [PATCH 17/18] md5 can be unstable; use sha256 instead --- 
rslp/crop/rapids_togo/create_windows_for_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 733392c7..9c84788f 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -72,7 +72,7 @@ def create_window( if not is_test: # Check if train or val. # If split by polygon id, no samples from the same polygon will be in the same split. - is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + is_val = hashlib.sha256(str(window_name).encode()).hexdigest()[0] in ["0", "1"] if is_val: split = "val" From a4bf218fcdd898cd0238d0bc708213a6a90cfca7 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 9 Jul 2025 16:19:28 -0700 Subject: [PATCH 18/18] update start and end times, add comment explaining why --- rslp/crop/rapids_togo/create_windows_for_labels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 9c84788f..696c3b14 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -23,8 +23,9 @@ LABEL_LAYER = "label" # data was collected in May 2020, so we consider the 6 months before and after may -START_TIME = datetime(2019, 12, 1, tzinfo=timezone.utc) -END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) +# we pick the center month; the actual range will be managed by the offset in the config. +START_TIME = datetime(2020, 5, 1, tzinfo=timezone.utc) +END_TIME = datetime(2020, 5, 31, tzinfo=timezone.utc) def create_window(
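
Note (illustrative, not part of the patch series): the final two patches switch the train/val assignment in create_window() from hashlib.md5 to hashlib.sha256, so that the same window name always lands in the same split regardless of platform (md5 can be unavailable or restricted on some FIPS-enabled Python builds). A minimal, self-contained sketch of that idea follows; the helper name assign_split and the example window name are hypothetical and do not appear in the patches.

    import hashlib

    def assign_split(window_name: str) -> str:
        """Deterministically assign ~12.5% of windows to "val".

        Taking the first hex digit of the SHA-256 digest and checking it against
        {"0", "1"} selects 2 of 16 equally likely digits, i.e. roughly 12.5% of
        window names, and the assignment is stable across runs and machines.
        """
        first_hex_digit = hashlib.sha256(window_name.encode()).hexdigest()[0]
        return "val" if first_hex_digit in ("0", "1") else "train"

    # The same window name always maps to the same split.
    print(assign_split("42-crop_merged_v2.csv_9.4321_1.2345"))

Because the window name embeds the unique_id (row index plus source file name) together with the centroid latitude and longitude, rerunning create_windows_for_labels.py reproduces the same train/val partition, while windows created from the test CSV bypass the hash and are tagged "test" directly.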