From 327c649a045e61346ffb26ac6a0ea011477062c0 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 18 Jun 2025 13:13:59 +0100 Subject: [PATCH 01/18] rapids togo labels --- .../rapids_togo/create_windows_for_labels.py | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 rslp/crop/rapids_togo/create_windows_for_labels.py diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py new file mode 100644 index 00000000..a24d0274 --- /dev/null +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -0,0 +1,209 @@ +"""Create windows for crop type mapping. + +Data from https://zenodo.org/records/3836629 +""" + +import argparse +import hashlib +import multiprocessing +from datetime import datetime, timezone + +import geopandas +import pandas as pd +import shapely +import tqdm +from rslearn.const import WGS84_PROJECTION +from rslearn.dataset import Window +from rslearn.utils import Projection, STGeometry, get_utm_ups_crs +from rslearn.utils.feature import Feature +from rslearn.utils.mp import star_imap_unordered +from rslearn.utils.vector_format import GeojsonVectorFormat +from upath import UPath + +WINDOW_RESOLUTION = 10 +LABEL_LAYER = "label" + +# data was collected in May 2020, so we consider the 6 months before and after may +START_TIME = datetime(2019, 12, 1, tzinfo=timezone.utc) +END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) + + +def process_files(shapefile_path: UPath) -> pd.DataFrame: + """Create windows for crop type mapping. + + Args: + csv_path: path to the csv file + num_pixels: number of points to sample from each polygon + postprocess_categories: whether to postprocess categories + """ + df = geopandas.read_file(shapefile_path) + is_crop = 1 + if "non" in shapefile_path.name.lower(): + is_crop = 0 + + df["is_crop"] = is_crop + + df["longitude"] = df.geometry.centroid.x + df["latitude"] = df.geometry.centroid.y + + df["org_file"] = shapefile_path.name + df.reset_index() + df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) + + return df[["is_crop", "geometry", "latitude", "longitude", "org_file", "unique_id"]] + + +def create_window( + csv_row: pd.Series, + ds_path: UPath, + group_name: str, + window_size: int, +) -> None: + """Create windows for crop type mapping. + + Args: + csv_row: a row of the dataframe + ds_path: path to the dataset + group_name: name of the group + window_size: window size + """ + # Get sample metadata + polygon_id = csv_row["unique_id"] + latitude, longitude = csv_row["latitude"], csv_row["longitude"] + is_crop = csv_row["is_crop"] + category = is_crop + + src_point = shapely.Point(longitude, latitude) + src_geometry = STGeometry(WGS84_PROJECTION, src_point, None) + dst_crs = get_utm_ups_crs(longitude, latitude) + dst_projection = Projection(dst_crs, WINDOW_RESOLUTION, -WINDOW_RESOLUTION) + dst_geometry = src_geometry.to_projection(dst_projection) + + # This is specific for window size = 1. + if window_size == 1: + bounds = ( + int(dst_geometry.shp.x), + int(dst_geometry.shp.y) - window_size, + int(dst_geometry.shp.x) + window_size, + int(dst_geometry.shp.y), + ) + else: + bounds = ( + int(dst_geometry.shp.x), + int(dst_geometry.shp.y), + int(dst_geometry.shp.x) + window_size // 2, + int(dst_geometry.shp.y) + window_size // 2, + ) + + # Check if train or val. + group = f"{group_name}_window_{window_size}" + window_name = f"{polygon_id}_{latitude}_{longitude}" + + # If split by polygon id, no samples from the same polygon will be in the same split. 
+ is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + + if is_val: + split = "val" + else: + split = "train" + + window = Window( + path=Window.get_window_root(ds_path, group, window_name), + group=group, + name=window_name, + projection=dst_projection, + bounds=bounds, + time_range=(START_TIME, END_TIME), + options={ + "split": split, + "is_crop": is_crop, + "category": category, + }, + ) + window.save() + + # Add the label. + feature = Feature( + window.get_geometry(), + { + "category": category, + }, + ) + layer_dir = window.get_layer_dir(LABEL_LAYER) + GeojsonVectorFormat().encode_vector(layer_dir, [feature]) + window.mark_layer_completed(LABEL_LAYER) + + +def create_windows_from_csv( + csv_paths: UPath, + ds_path: UPath, + group_name: str, + window_size: int, +) -> None: + """Create windows from csv. + + Args: + csv_path: path to the csv file + ds_path: path to the dataset + group_name: name of the group + window_size: window size + """ + for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + df_sampled = process_files(csv_paths / filename) + csv_rows = [] + for _, row in df_sampled.iterrows(): + csv_rows.append(row) + + jobs = [ + dict( + csv_row=row, + ds_path=ds_path, + group_name=group_name, + window_size=window_size, + ) + for row in csv_rows + ] + p = multiprocessing.Pool(32) + outputs = star_imap_unordered(p, create_window, jobs) + for _ in tqdm.tqdm(outputs, total=len(jobs)): + pass + p.close() + + +if __name__ == "__main__": + + multiprocessing.set_start_method("forkserver") + parser = argparse.ArgumentParser(description="Create windows from csv") + parser.add_argument( + "--csv_paths", + type=str, + required=True, + help="Path to the csv file", + ) + parser.add_argument( + "--ds_path", + type=str, + required=True, + help="Path to the dataset", + ) + parser.add_argument( + "--group_name", + type=str, + required=False, + help="Name of the group", + default="groundtruth", + ) + parser.add_argument( + "--window_size", + type=int, + required=False, + help="Window size", + default=1, + ) + args = parser.parse_args() + create_windows_from_csv( + UPath(args.csv_paths), + UPath(args.ds_path), + args.group_name, + window_size=args.window_size, + ) From 3a79a54c4ee2217b85f47a80e69026bd5f3dc021 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 18 Jun 2025 13:31:37 +0100 Subject: [PATCH 02/18] Add geopandas to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7358718e..d46ef940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ ruff>=0.7 scikit-image>=0.23 typing-extensions>=4.11 uvicorn>=0.32 +geopandas>=1.1.0 From 492a2316e92c64856ce4ecf5160565dba3039ced Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 14:39:42 -0700 Subject: [PATCH 03/18] use csvs instead of shapefiles --- .../rapids_togo/create_windows_for_labels.py | 31 ++------------- rslp/crop/rapids_togo/to_csv.py | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+), 28 deletions(-) create mode 100644 rslp/crop/rapids_togo/to_csv.py diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index a24d0274..72c6b03e 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -28,31 +28,6 @@ END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) -def process_files(shapefile_path: UPath) -> pd.DataFrame: - """Create windows for crop type mapping. 
- - Args: - csv_path: path to the csv file - num_pixels: number of points to sample from each polygon - postprocess_categories: whether to postprocess categories - """ - df = geopandas.read_file(shapefile_path) - is_crop = 1 - if "non" in shapefile_path.name.lower(): - is_crop = 0 - - df["is_crop"] = is_crop - - df["longitude"] = df.geometry.centroid.x - df["latitude"] = df.geometry.centroid.y - - df["org_file"] = shapefile_path.name - df.reset_index() - df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) - - return df[["is_crop", "geometry", "latitude", "longitude", "org_file", "unique_id"]] - - def create_window( csv_row: pd.Series, ds_path: UPath, @@ -148,8 +123,8 @@ def create_windows_from_csv( group_name: name of the group window_size: window size """ - for filename in ["crop_merged_v2", "noncrop_merged_v2"]: - df_sampled = process_files(csv_paths / filename) + for filename in ["crop_merged_v2.csv", "noncrop_merged_v2.csv"]: + df_sampled = pd.read_csv(csv_paths / filename) csv_rows = [] for _, row in df_sampled.iterrows(): csv_rows.append(row) @@ -177,7 +152,7 @@ def create_windows_from_csv( parser.add_argument( "--csv_paths", type=str, - required=True, + default="gs://ai2-helios-us-central1/evaluations/crop_type_mapping/togo_2020", help="Path to the csv file", ) parser.add_argument( diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py new file mode 100644 index 00000000..20b5e90a --- /dev/null +++ b/rslp/crop/rapids_togo/to_csv.py @@ -0,0 +1,38 @@ +""" +It's easier for google cloud if the files are in a single csv instead of +in a shapefile, so we process it into csvs. +""" +from upath import UPath +import pandas as pd +import geopandas + + +def process_files(shapefile_path: UPath) -> pd.DataFrame: + """Create windows for crop type mapping. 
+ + Args: + csv_path: path to the csv file + num_pixels: number of points to sample from each polygon + postprocess_categories: whether to postprocess categories + """ + df = geopandas.read_file(shapefile_path) + is_crop = 1 + if "non" in shapefile_path.name.lower(): + is_crop = 0 + + df["is_crop"] = is_crop + + df["longitude"] = df.geometry.centroid.x + df["latitude"] = df.geometry.centroid.y + + df["org_file"] = shapefile_path.name + df.reset_index() + df["unique_id"] = df.apply(lambda x: f"{x.name}-{x.org_file}", axis=1) + + return df[["is_crop", "latitude", "longitude", "org_file", "unique_id"]] + + +if __name__ == "__main__": + for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + df = process_files(UPath(filename)) + df.to_csv(f"{UPath(filename).stem}.csv") From 8fd616cb5d58493701798b30f9f7d9630ef4b8c1 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:05:56 -0700 Subject: [PATCH 04/18] Add config and finetuning yaml for Togo 2020 --- data/helios/togo_2020/config.json | 126 ++++++++ data/helios/togo_2020/finetune_12_months.yaml | 271 ++++++++++++++++++ 2 files changed, 397 insertions(+) create mode 100644 data/helios/togo_2020/config.json create mode 100644 data/helios/togo_2020/finetune_12_months.yaml diff --git a/data/helios/togo_2020/config.json b/data/helios/togo_2020/config.json new file mode 100644 index 00000000..8f9cd926 --- /dev/null +++ b/data/helios/togo_2020/config.json @@ -0,0 +1,126 @@ +{ + "layers": { + "label": { + "type": "vector" + }, + "sentinel1_ascending": { + "band_sets": [ + { + "bands": [ + "vv", + "vh" + ], + "dtype": "float32" + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "ingest": false, + "name": "rslp.satlas.data_sources.MonthlySentinel1", + "query": { + "sar:instrument_mode": { + "eq": "IW" + }, + "sar:polarizations": { + "eq": [ + "VV", + "VH" + ] + }, + "sat:orbit_state": { + "eq": "ascending" + } + }, + "query_config": { + "max_matches": 12 + }, + "time_offset": "-180d" + }, + "type": "raster" + }, + "sentinel1_descending": { + "band_sets": [ + { + "bands": [ + "vv", + "vh" + ], + "dtype": "float32" + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "ingest": false, + "name": "rslp.satlas.data_sources.MonthlySentinel1", + "query": { + "sar:instrument_mode": { + "eq": "IW" + }, + "sar:polarizations": { + "eq": [ + "VV", + "VH" + ] + }, + "sat:orbit_state": { + "eq": "descending" + } + }, + "query_config": { + "max_matches": 12 + }, + "time_offset": "-180d" + }, + "type": "raster" + }, + "sentinel2": { + "band_sets": [ + { + "bands": [ + "B02", + "B03", + "B04", + "B08" + ], + "dtype": "uint16" + }, + { + "bands": [ + "B05", + "B06", + "B07", + "B8A", + "B11", + "B12" + ], + "dtype": "uint16", + "zoom_offset": -1 + }, + { + "bands": [ + "B01", + "B09" + ], + "dtype": "uint16", + "zoom_offset": -2 + } + ], + "data_source": { + "cache_dir": "cache/planetary_computer", + "duration": "366d", + "harmonize": true, + "ingest": false, + "max_cloud_cover": 50, + "name": "rslp.satlas.data_sources.MonthlyAzureSentinel2", + "query_config": { + "max_matches": 12 + }, + "sort_by": "eo:cloud_cover", + "time_offset": "-180d" + }, + "type": "raster" + } + } +} diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml new file mode 100644 index 00000000..3486a052 --- /dev/null +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -0,0 +1,271 @@ +model: + class_path: 
rslearn.train.lightning_module.RslearnLightningModule + init_args: + model: + class_path: rslearn.models.multitask.MultiTaskModel + init_args: + encoder: + - class_path: rslp.helios.model.Helios + init_args: + checkpoint_path: "{CHECKPOINT_PATH}" + selector: ["encoder"] + forward_kwargs: + patch_size: {PATCH_SIZE} + decoders: + crop_type_classification: + - class_path: rslearn.models.pooling_decoder.PoolingDecoder + init_args: + in_channels: {ENCODER_EMBEDDING_SIZE} + out_channels: 8 + - class_path: rslearn.train.tasks.classification.ClassificationHead + lr: 0.0001 + plateau: true + plateau_factor: 0.2 + plateau_patience: 2 + plateau_min_lr: 0 + plateau_cooldown: 10 +data: + class_path: rslearn.train.data_module.RslearnDataModule + init_args: + path: weka://dfive-default/rslearn-eai/datasets/crop/kenya_nandi/20250625 + inputs: + sentinel2_0: + data_type: "raster" + layers: ["sentinel2"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_1: + data_type: "raster" + layers: ["sentinel2.1"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_2: + data_type: "raster" + layers: ["sentinel2.2"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_3: + data_type: "raster" + layers: ["sentinel2.3"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_4: + data_type: "raster" + layers: ["sentinel2.4"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_5: + data_type: "raster" + layers: ["sentinel2.5"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_6: + data_type: "raster" + layers: ["sentinel2.6"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_7: + data_type: "raster" + layers: ["sentinel2.7"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_8: + data_type: "raster" + layers: ["sentinel2.8"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_9: + data_type: "raster" + layers: ["sentinel2.9"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_10: + data_type: "raster" + layers: ["sentinel2.10"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel2_11: + data_type: "raster" + layers: ["sentinel2.11"] + bands: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + passthrough: true + dtype: FLOAT32 + sentinel1_0: + data_type: "raster" + layers: ["sentinel1_ascending"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_1: + data_type: "raster" + layers: ["sentinel1_ascending.1"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_2: + data_type: "raster" + layers: ["sentinel1_ascending.2"] + bands: ["vv", "vh"] + passthrough: true + dtype: 
FLOAT32 + sentinel1_3: + data_type: "raster" + layers: ["sentinel1_ascending.3"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_4: + data_type: "raster" + layers: ["sentinel1_ascending.4"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_5: + data_type: "raster" + layers: ["sentinel1_ascending.5"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_6: + data_type: "raster" + layers: ["sentinel1_ascending.6"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_7: + data_type: "raster" + layers: ["sentinel1_ascending.7"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_8: + data_type: "raster" + layers: ["sentinel1_ascending.8"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_9: + data_type: "raster" + layers: ["sentinel1_ascending.9"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_10: + data_type: "raster" + layers: ["sentinel1_ascending.10"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + sentinel1_11: + data_type: "raster" + layers: ["sentinel1_ascending.11"] + bands: ["vv", "vh"] + passthrough: true + dtype: FLOAT32 + label: + data_type: "vector" + layers: ["label"] + is_target: true + task: + class_path: rslearn.train.tasks.multi_task.MultiTask + init_args: + tasks: + crop_type_classification: + class_path: rslearn.train.tasks.classification.ClassificationTask + init_args: + property_name: "category" + classes: ["Coffee", "Trees", "Grassland", "Maize", "Sugarcane", "Tea", "Water", "Built-up"] + enable_f1_metric: true + metric_kwargs: + average: "micro" + input_mapping: + crop_type_classification: + label: "targets" + batch_size: 8 + num_workers: 32 + default_config: + transforms: + - class_path: rslearn.train.transforms.concatenate.Concatenate + init_args: + selections: + sentinel2_0: [] + sentinel2_1: [] + sentinel2_2: [] + sentinel2_3: [] + sentinel2_4: [] + sentinel2_5: [] + sentinel2_6: [] + sentinel2_7: [] + sentinel2_8: [] + sentinel2_9: [] + sentinel2_10: [] + sentinel2_11: [] + output_selector: sentinel2_l2a + - class_path: rslearn.train.transforms.concatenate.Concatenate + init_args: + selections: + sentinel1_0: [] + sentinel1_1: [] + sentinel1_2: [] + sentinel1_3: [] + sentinel1_4: [] + sentinel1_5: [] + sentinel1_6: [] + sentinel1_7: [] + sentinel1_8: [] + sentinel1_9: [] + sentinel1_10: [] + sentinel1_11: [] + output_selector: sentinel1 + - class_path: rslp.helios.norm.HeliosNormalize + init_args: + config_fname: "/opt/helios/data/norm_configs/computed.json" + band_names: + sentinel2_l2a: ["B02", "B03", "B04", "B08", "B05", "B06", "B07", "B8A", "B11", "B12", "B01", "B09"] + sentinel1: ["vv", "vh"] + - class_path: rslearn.train.transforms.pad.Pad + init_args: + size: 8 + mode: "center" + image_selectors: ["sentinel2_l2a", "sentinel1"] + train_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "train" + val_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "val" + test_config: + groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + tags: + split: "val" +trainer: + max_epochs: 100 + callbacks: + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: "epoch" + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + save_top_k: 1 + save_last: true + monitor: val_loss + mode: min + - class_path: 
rslearn.train.callbacks.freeze_unfreeze.FreezeUnfreeze + init_args: + module_selector: ["model", "encoder", 0] + unfreeze_at_epoch: 2 +rslp_project: placeholder +rslp_experiment: placeholder From 127fc82a3494c42b7085adaf69a40b38aa766bc3 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:30:11 -0700 Subject: [PATCH 05/18] Use the right data for finetuning --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 3486a052..285c2a9a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -27,7 +27,7 @@ model: data: class_path: rslearn.train.data_module.RslearnDataModule init_args: - path: weka://dfive-default/rslearn-eai/datasets/crop/kenya_nandi/20250625 + path: weka://dfive-default/rslearn-eai/datasets/crop/togo/20250701 inputs: sentinel2_0: data_type: "raster" From 54ed66d31f4c4305d60ca1bb1a01d72394c1355e Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 15:52:00 -0700 Subject: [PATCH 06/18] Fix path --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 285c2a9a..38896d85 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -27,7 +27,7 @@ model: data: class_path: rslearn.train.data_module.RslearnDataModule init_args: - path: weka://dfive-default/rslearn-eai/datasets/crop/togo/20250701 + path: weka://dfive-default/rslearn-eai/datasets/crop/togo_2020/20250701 inputs: sentinel2_0: data_type: "raster" From a3b7c11d0196d44abdd3b979dd30479a630c0555 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 1 Jul 2025 16:01:51 -0700 Subject: [PATCH 07/18] more yaml fixes --- data/helios/togo_2020/finetune_12_months.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 38896d85..5719fe70 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -240,15 +240,15 @@ data: mode: "center" image_selectors: ["sentinel2_l2a", "sentinel1"] train_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "train" val_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "val" test_config: - groups: ["groundtruth_polygon_split_window_32", "worldcover_window_32"] + groups: ["groundtruth_window_32"] tags: split: "val" trainer: From cb278898d0ed39ef6a8dfb1bd3397e35effabd2a Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 11:25:43 -0700 Subject: [PATCH 08/18] Fix categories for Togo finetuning --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index 5719fe70..a6280883 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -185,7 +185,7 @@ data: class_path: rslearn.train.tasks.classification.ClassificationTask init_args: property_name: "category" - classes: ["Coffee", "Trees", "Grassland", "Maize", 
"Sugarcane", "Tea", "Water", "Built-up"] + classes: [1, 0] enable_f1_metric: true metric_kwargs: average: "micro" From 72d7a8135c82e6bb1ed7a09092133930115e8203 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 11:30:11 -0700 Subject: [PATCH 09/18] linting --- requirements.txt | 2 +- rslp/crop/rapids_togo/create_windows_for_labels.py | 4 +--- rslp/crop/rapids_togo/to_csv.py | 12 ++++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index d46ef940..879c91bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ beaker-py>=2.0 fastapi>=0.115 +geopandas>=1.1.0 google-cloud-bigtable>=2.18 google-cloud-pubsub>=2.18 interrogate>=1.7 @@ -12,4 +13,3 @@ ruff>=0.7 scikit-image>=0.23 typing-extensions>=4.11 uvicorn>=0.32 -geopandas>=1.1.0 diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 72c6b03e..1b1f96fa 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -8,7 +8,6 @@ import multiprocessing from datetime import datetime, timezone -import geopandas import pandas as pd import shapely import tqdm @@ -118,7 +117,7 @@ def create_windows_from_csv( """Create windows from csv. Args: - csv_path: path to the csv file + csv_paths: path to the csv files ds_path: path to the dataset group_name: name of the group window_size: window size @@ -146,7 +145,6 @@ def create_windows_from_csv( if __name__ == "__main__": - multiprocessing.set_start_method("forkserver") parser = argparse.ArgumentParser(description="Create windows from csv") parser.add_argument( diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py index 20b5e90a..2fffa242 100644 --- a/rslp/crop/rapids_togo/to_csv.py +++ b/rslp/crop/rapids_togo/to_csv.py @@ -1,19 +1,19 @@ -""" +"""Turn shapefiles into csvs. + It's easier for google cloud if the files are in a single csv instead of in a shapefile, so we process it into csvs. """ -from upath import UPath -import pandas as pd + import geopandas +import pandas as pd +from upath import UPath def process_files(shapefile_path: UPath) -> pd.DataFrame: """Create windows for crop type mapping. 
Args: - csv_path: path to the csv file - num_pixels: number of points to sample from each polygon - postprocess_categories: whether to postprocess categories + shapefile_path: path to the shapefile """ df = geopandas.read_file(shapefile_path) is_crop = 1 From 276ea2b771c315611c9bd86806743cff0b5b52ea Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 12:59:29 -0700 Subject: [PATCH 10/18] fix num classes --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index a6280883..de7b9f1a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -16,7 +16,7 @@ model: - class_path: rslearn.models.pooling_decoder.PoolingDecoder init_args: in_channels: {ENCODER_EMBEDDING_SIZE} - out_channels: 8 + out_channels: 2 - class_path: rslearn.train.tasks.classification.ClassificationHead lr: 0.0001 plateau: true From de9bf8089110bb4545968971a60159205a4951e6 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 13:30:33 -0700 Subject: [PATCH 11/18] Add eval set --- rslp/crop/rapids_togo/to_csv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rslp/crop/rapids_togo/to_csv.py b/rslp/crop/rapids_togo/to_csv.py index 2fffa242..de3b71f9 100644 --- a/rslp/crop/rapids_togo/to_csv.py +++ b/rslp/crop/rapids_togo/to_csv.py @@ -33,6 +33,10 @@ def process_files(shapefile_path: UPath) -> pd.DataFrame: if __name__ == "__main__": - for filename in ["crop_merged_v2", "noncrop_merged_v2"]: + for filename in ["crop_merged_v2", "noncrop_merged_v2", "togo_test_majority"]: + csv_name = UPath(".") / f"{UPath(filename).stem}.csv" + if csv_name.exists(): + print(f"{csv_name} exists - skipping") + continue df = process_files(UPath(filename)) - df.to_csv(f"{UPath(filename).stem}.csv") + df.to_csv(csv_name) From 23757af770037db5997790cb05ac30740924bcef Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 13:31:46 -0700 Subject: [PATCH 12/18] Add eval to the create_windows function too --- rslp/crop/rapids_togo/create_windows_for_labels.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 1b1f96fa..fe514ee7 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -122,7 +122,11 @@ def create_windows_from_csv( group_name: name of the group window_size: window size """ - for filename in ["crop_merged_v2.csv", "noncrop_merged_v2.csv"]: + for filename in [ + "crop_merged_v2.csv", + "noncrop_merged_v2.csv", + "togo_test_majority.csv", + ]: df_sampled = pd.read_csv(csv_paths / filename) csv_rows = [] for _, row in df_sampled.iterrows(): From ab884c2752963209eea6baf1f139a00a03883a65 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 14:10:34 -0700 Subject: [PATCH 13/18] test category for the test set --- .../rapids_togo/create_windows_for_labels.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index fe514ee7..1978104e 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -28,10 +28,7 @@ def create_window( - csv_row: pd.Series, - ds_path: UPath, - 
group_name: str, - window_size: int, + csv_row: pd.Series, ds_path: UPath, group_name: str, window_size: int, is_test: bool ) -> None: """Create windows for crop type mapping. @@ -40,6 +37,7 @@ def create_window( ds_path: path to the dataset group_name: name of the group window_size: window size + is_test: whether or not this is a test window """ # Get sample metadata polygon_id = csv_row["unique_id"] @@ -69,17 +67,19 @@ def create_window( int(dst_geometry.shp.y) + window_size // 2, ) - # Check if train or val. group = f"{group_name}_window_{window_size}" window_name = f"{polygon_id}_{latitude}_{longitude}" - - # If split by polygon id, no samples from the same polygon will be in the same split. - is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] - - if is_val: - split = "val" + if not is_test: + # Check if train or val. + # If split by polygon id, no samples from the same polygon will be in the same split. + is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + + if is_val: + split = "val" + else: + split = "train" else: - split = "train" + split = "test" window = Window( path=Window.get_window_root(ds_path, group, window_name), From 126dad9a8d20b09dc0a8cc48acfb8c1054dcf164 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 2 Jul 2025 14:13:03 -0700 Subject: [PATCH 14/18] Actually pass the argument --- rslp/crop/rapids_togo/create_windows_for_labels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 1978104e..733392c7 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -138,6 +138,7 @@ def create_windows_from_csv( ds_path=ds_path, group_name=group_name, window_size=window_size, + is_test="test" in filename, ) for row in csv_rows ] From f2a4e6ce504b85c41a3e0e6c8a9b494cd5bed339 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Thu, 3 Jul 2025 10:32:40 -0700 Subject: [PATCH 15/18] keep the encoder frozen - emulate a linear probe --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index de7b9f1a..ded43d15 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -266,6 +266,6 @@ trainer: - class_path: rslearn.train.callbacks.freeze_unfreeze.FreezeUnfreeze init_args: module_selector: ["model", "encoder", 0] - unfreeze_at_epoch: 2 + unfreeze_at_epoch: 100 rslp_project: placeholder rslp_experiment: placeholder From d9e6c836100e57a46cbf5d4bacd6e6101277ad00 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Thu, 3 Jul 2025 11:10:05 -0700 Subject: [PATCH 16/18] use the test set --- data/helios/togo_2020/finetune_12_months.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/helios/togo_2020/finetune_12_months.yaml b/data/helios/togo_2020/finetune_12_months.yaml index ded43d15..fb701c7a 100644 --- a/data/helios/togo_2020/finetune_12_months.yaml +++ b/data/helios/togo_2020/finetune_12_months.yaml @@ -250,7 +250,7 @@ data: test_config: groups: ["groundtruth_window_32"] tags: - split: "val" + split: "test" trainer: max_epochs: 100 callbacks: From fee1c9e9683af420ba8a2ffbdf8f727624c60002 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 9 Jul 2025 16:04:22 -0700 Subject: [PATCH 17/18] md5 can be unstable; use sha256 instead --- 
rslp/crop/rapids_togo/create_windows_for_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 733392c7..9c84788f 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -72,7 +72,7 @@ def create_window( if not is_test: # Check if train or val. # If split by polygon id, no samples from the same polygon will be in the same split. - is_val = hashlib.md5(str(window_name).encode()).hexdigest()[0] in ["0", "1"] + is_val = hashlib.sha256(str(window_name).encode()).hexdigest()[0] in ["0", "1"] if is_val: split = "val" From a4bf218fcdd898cd0238d0bc708213a6a90cfca7 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 9 Jul 2025 16:19:28 -0700 Subject: [PATCH 18/18] update start and end times, add comment explaining why --- rslp/crop/rapids_togo/create_windows_for_labels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rslp/crop/rapids_togo/create_windows_for_labels.py b/rslp/crop/rapids_togo/create_windows_for_labels.py index 9c84788f..696c3b14 100644 --- a/rslp/crop/rapids_togo/create_windows_for_labels.py +++ b/rslp/crop/rapids_togo/create_windows_for_labels.py @@ -23,8 +23,9 @@ LABEL_LAYER = "label" # data was collected in May 2020, so we consider the 6 months before and after may -START_TIME = datetime(2019, 12, 1, tzinfo=timezone.utc) -END_TIME = datetime(2020, 12, 31, tzinfo=timezone.utc) +# we pick the center month; the actual range will be managed by the offset in the config. +START_TIME = datetime(2020, 5, 1, tzinfo=timezone.utc) +END_TIME = datetime(2020, 5, 31, tzinfo=timezone.utc) def create_window(
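
Note (illustrative, not part of the patch series): the final two patches switch the train/val assignment in create_window() from hashlib.md5 to hashlib.sha256, so that the same window name always lands in the same split regardless of platform (md5 can be unavailable or restricted on some FIPS-enabled Python builds). A minimal, self-contained sketch of that idea follows; the helper name assign_split and the example window name are hypothetical and do not appear in the patches.

    import hashlib

    def assign_split(window_name: str) -> str:
        """Deterministically assign ~12.5% of windows to "val".

        Taking the first hex digit of the SHA-256 digest and checking it against
        {"0", "1"} selects 2 of 16 equally likely digits, i.e. roughly 12.5% of
        window names, and the assignment is stable across runs and machines.
        """
        first_hex_digit = hashlib.sha256(window_name.encode()).hexdigest()[0]
        return "val" if first_hex_digit in ("0", "1") else "train"

    # The same window name always maps to the same split.
    print(assign_split("42-crop_merged_v2.csv_9.4321_1.2345"))

Because the window name embeds the unique_id (row index plus source file name) together with the centroid latitude and longitude, rerunning create_windows_for_labels.py reproduces the same train/val partition, while windows created from the test CSV bypass the hash and are tagged "test" directly.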