Merge pull request #311 from NatLabRockies/bnb/collection

bnb32 · web-flow · commit 2046f8bf3dcb · 2026-03-23T14:42:14.000-07:00
nc collection for arbitrary chunk shapes
diff --git a/sup3r/pipeline/forward_pass.py b/sup3r/pipeline/forward_pass.py
@@ -670,6 +670,8 @@ def run_chunk(
                 data=output_data,
                 features=lowered(model.hr_out_features),
                 lat_lon=chunk.hr_lat_lon,
+                row_inds=chunk.row_inds,
+                col_inds=chunk.col_inds,
                 times=chunk.hr_times,
                 out_file=chunk.out_file,
                 meta_data=meta,
diff --git a/sup3r/pipeline/strategy.py b/sup3r/pipeline/strategy.py
@@ -46,6 +46,8 @@ class ForwardPassChunk:
     hr_lat_lon: Union[np.ndarray, da.core.Array]
     hr_times: pd.DatetimeIndex
     gids: Union[np.ndarray, da.core.Array]
+    row_inds: np.ndarray
+    col_inds: np.ndarray
     out_file: str
     pad_width: tuple[tuple, tuple, tuple]
     index: int
@@ -458,6 +460,16 @@ def hr_lat_lon(self):
         )
         return OutputHandler.get_lat_lon(lr_lat_lon, shape)
 
+    @cached_property
+    def grid_inds(self):
+        """Get row and column indices for the full high resolution grid. This
+        is used to collect spatially contiguous data for stitching output
+        chunks back together."""
+        shape = self.hr_lat_lon.shape[:-1]
+        row_inds = np.arange(shape[0])
+        col_inds = np.arange(shape[1])
+        return row_inds, col_inds
+
     @cached_property
     def out_files(self):
         """Get list of output file names for each file chunk forward pass."""
@@ -580,6 +592,8 @@ def init_chunk(self, chunk_index=0):
             hr_times=OutputHandler.get_times(
                 lr_times, self.t_enhance * len(lr_times)
             ),
+            row_inds=self.grid_inds[0][hr_slice[0]],
+            col_inds=self.grid_inds[1][hr_slice[1]],
             gids=self.gids[hr_slice[:2]],
             out_file=self.out_files[chunk_index],
             pad_width=self.fwp_slicer.extra_padding[chunk_index],
diff --git a/sup3r/postprocessing/collectors/h5.py b/sup3r/postprocessing/collectors/h5.py
@@ -10,6 +10,7 @@
 from rex.utilities.loggers import init_logger
 from scipy.spatial import KDTree
 
+from sup3r.preprocessing import Loader
 from sup3r.preprocessing.utilities import _mem_check
 from sup3r.utilities.utilities import get_dset_attrs, get_tmp_file
 from sup3r.writers import RexOutputs
@@ -93,13 +94,15 @@ def get_coordinate_indices(cls, target_meta, full_meta, threshold=1e-4):
         threshold : float
             Threshold distance for finding target coordinates within full meta
         """
-        ll2 = np.vstack(
-            (full_meta.latitude.values, full_meta.longitude.values)
-        ).T
+        ll2 = np.vstack((
+            full_meta.latitude.values,
+            full_meta.longitude.values,
+        )).T
         tree = KDTree(ll2)
-        targets = np.vstack(
-            (target_meta.latitude.values, target_meta.longitude.values)
-        ).T
+        targets = np.vstack((
+            target_meta.latitude.values,
+            target_meta.longitude.values,
+        )).T
         _, indices = tree.query(targets, distance_upper_bound=threshold)
         indices = indices[indices < len(full_meta)]
         return indices
@@ -723,7 +726,7 @@ def collect(
         cls,
         file_paths,
         out_file,
-        features,
+        features='all',
         max_workers=None,
         log_level=None,
         log_file=None,
@@ -746,23 +749,16 @@ def collect(
             ``*_{temporal_chunk_index}_{spatial_chunk_index}.h5``.
         out_file : str
             File path of final output file.
-        features : list
-            List of dsets to collect
+        features : list | str
+            List of dsets to collect. If 'all' then all datasets will be
+            collected.
         max_workers : int | None
             Number of workers to use in parallel. 1 runs serial,
             None will use all available workers.
         log_level : str | None
             Desired log level, None will not initialize logging.
         log_file : str | None
             Target log file. None logs to stdout.
-        write_status : bool
-            Flag to write status file once complete if running from pipeline.
-        job_name : str
-            Job name for status file if running from pipeline.
-        pipeline_step : str, optional
-            Name of the pipeline step being run. If ``None``, the
-            ``pipeline_step`` will be set to ``"collect``, mimicking old reV
-            behavior. By default, ``None``.
         target_meta_file : str
             Path to target final meta containing coordinates to keep from the
             full file list collected meta. This can be but is not necessarily a
@@ -796,6 +792,11 @@ def collect(
             os.makedirs(os.path.dirname(out_file), exist_ok=True)
 
         collector = cls(file_paths)
+        features = (
+            Loader(collector.flist[0]).features
+            if features == 'all'
+            else features
+        )
         logger.info(
             'Collecting %s files to %s', len(collector.flist), out_file
         )
diff --git a/sup3r/postprocessing/collectors/nc.py b/sup3r/postprocessing/collectors/nc.py
@@ -9,7 +9,7 @@
 import xarray as xr
 from rex.utilities.loggers import init_logger
 
-from sup3r.preprocessing.loaders import Loader
+from sup3r.preprocessing import Loader
 from sup3r.preprocessing.names import Dimension
 from sup3r.writers import Cacher
 
@@ -32,18 +32,9 @@ def collect(
         overwrite=True,
         res_kwargs=None,
         cacher_kwargs=None,
-        is_regular_grid=True,
     ):
         """Collect data files from a dir to one output file.
 
-        TODO: For a regular grid (lat values are constant across lon and vice
-        versa) collecting lat / lon chunks is supported. For curvilinear grids
-        only collection of chunks that are split by latitude are supported.
-        This should be generalized to allow for any spatial chunking and any
-        dimension. I think this would require a new file naming scheme with a
-        spatial index for both latitude and longitude or checking each chunk
-        to see how they are split.
-
         Filename requirements:
          - Should end with ".nc"
 
@@ -61,23 +52,12 @@ def collect(
             Desired log level, None will not initialize logging.
         log_file : str | None
             Target log file. None logs to stdout.
-        write_status : bool
-            Flag to write status file once complete if running from pipeline.
-        job_name : str
-            Job name for status file if running from pipeline.
         overwrite : bool
             Whether to overwrite existing output file
         res_kwargs : dict | None
             Dictionary of kwargs to pass to xarray.open_mfdataset.
         cacher_kwargs : dict | None
             Dictionary of kwargs to pass to Cacher._write_single.
-        is_regular_grid : bool
-            Whether the data is on a regular grid. If True then spatial chunks
-            can be combined across both latitude and longitude. If False then
-            spatial chunks must all have the same longitude values to be
-            combined. If you need completely general chunk collection then
-            you should write chunks to `h5` files and use
-            :class:`sup3r.postprocessing.collectors.h5.CollectorH5`.
         """
         logger.info(f'Initializing collection for file_paths={file_paths}')
 
@@ -97,33 +77,18 @@ def collect(
             logger.info(f'overwrite=True, removing {out_file}.')
             os.remove(out_file)
 
-        spatial_chunks = collector.group_spatial_chunks()
-
         if not os.path.exists(out_file):
-            res_kwargs = res_kwargs or {
-                'combine': 'nested',
-                'concat_dim': Dimension.TIME,
-            }
-            for s_idx, sfiles in spatial_chunks.items():
-                schunk = Loader(sfiles, res_kwargs=res_kwargs)
-                spatial_chunks[s_idx] = schunk
-
-            # Set lat / lon as 1D arrays if regular grid and get the
-            # xr.Dataset _ds
-            if is_regular_grid:
-                spatial_chunks = {
-                    s_idx: schunk.set_regular_grid()._ds
-                    for s_idx, schunk in spatial_chunks.items()
-                }
-                out = xr.combine_by_coords(
-                    spatial_chunks.values(), combine_attrs='override'
-                )
-
-            else:
-                out = xr.concat(
-                    [sc._ds for sc in spatial_chunks.values()],
-                    dim=Dimension.SOUTH_NORTH,
-                )
+            dsets = list(
+                collector.group_spatial_chunks(res_kwargs=res_kwargs).values()
+            )
+
+            # Reset coords so that they are data_vars and can be combined
+            # across chunks. This is needed because coords can be 2d arrays,
+            # which can't be used to combine chunks. After combination, set
+            # them back to coords.
+            dsets = [ds.reset_coords(Dimension.coords_2d()) for ds in dsets]
+            out = xr.combine_by_coords(dsets, combine_attrs='override')
+            out = out.set_coords(Dimension.coords_2d())
 
             cacher_kwargs = cacher_kwargs or {}
             Cacher._write_single(
@@ -135,13 +100,14 @@ def collect(
 
         logger.info('Finished file collection.')
 
-    def group_spatial_chunks(self):
-        """Group same spatial chunks together so each entry has same spatial
-        footprint but different times"""
+    def group_spatial_chunks(self, res_kwargs=None):
+        """Group files by spatial chunk index, so each group has the same
+        spatial footprint but different times. Return a dict mapping each
+        spatial index to a `Loader` over that group with combined times."""
         chunks = {}
         for file in self.flist:
             _, s_idx = self.get_chunk_indices(file)
             chunks[s_idx] = [*chunks.get(s_idx, []), file]
         for k, v in chunks.items():
-            chunks[k] = sorted(v)
+            chunks[k] = Loader(sorted(v), res_kwargs=res_kwargs)
         return chunks
diff --git a/sup3r/utilities/pytest/helpers.py b/sup3r/utilities/pytest/helpers.py
@@ -347,6 +347,8 @@ def make_collect_chunks(td, ext='h5'):
                 out_file,
                 meta_data=model_meta_data,
                 max_workers=1,
+                row_inds=np.arange(shape[0])[s1_hr],
+                col_inds=np.arange(shape[1])[s2_hr],
                 gids=gids[s1_hr, s2_hr],
             )
 
diff --git a/sup3r/writers/base.py b/sup3r/writers/base.py
@@ -567,6 +567,8 @@ def _write_output(
         invert_uv=False,
         nn_fill=False,
         max_workers=None,
+        row_inds=None,
+        col_inds=None,
         gids=None,
     ):
         """Write output to file with specified times and lats/lons"""
@@ -584,6 +586,8 @@ def write_output(
         nn_fill=False,
         max_workers=None,
         gids=None,
+        row_inds=None,
+        col_inds=None,
     ):
         """Write forward pass output to file
 
@@ -615,6 +619,14 @@ def write_output(
         gids : list
             List of coordinate indices used to label each lat lon pair and to
             help with spatial chunk data collection
+        row_inds : np.ndarray
+            Array of row indices for the full high resolution grid. This is
+            used to collect spatially contiguous data for stitching output
+            chunks back together.
+        col_inds : np.ndarray
+            Array of column indices for the full high resolution grid. This is
+            used to collect spatially contiguous data for stitching output
+            chunks back together.
         """
         lat_lon = cls.get_lat_lon(low_res_lat_lon, data.shape[:2])
         times = cls.get_times(low_res_times, data.shape[-2])
@@ -628,5 +640,7 @@ def write_output(
             invert_uv=invert_uv,
             nn_fill=nn_fill,
             max_workers=max_workers,
+            row_inds=row_inds,
+            col_inds=col_inds,
             gids=gids,
         )
diff --git a/sup3r/writers/h5.py b/sup3r/writers/h5.py
@@ -28,6 +28,8 @@ def _write_output(
         invert_uv=False,
         nn_fill=False,
         max_workers=None,
+        row_inds=None,
+        col_inds=None,
         gids=None,
     ):
         """Write forward pass output to H5 file
@@ -57,6 +59,14 @@ def _write_output(
             neighbour or cap to limits
         max_workers : int | None
             Max workers to use for inverse transform.
+        row_inds : np.ndarray
+            Array of row indices for the full high resolution grid. This is
+            used to collect spatially contiguous data for stitching output
+            chunks back together.
+        col_inds : np.ndarray
+            Array of column indices for the full high resolution grid. This is
+            used to collect spatially contiguous data for stitching output
+            chunks back together.
         gids : list
             List of coordinate indices used to label each lat lon pair and to
             help with spatial chunk data collection
@@ -84,8 +94,20 @@ def _write_output(
             if gids is not None
             else np.arange(np.prod(lat_lon.shape[:-1]))
         )
+        row_inds = (
+            row_inds if row_inds is not None else np.arange(lat_lon.shape[0])
+        )
+        col_inds = (
+            col_inds if col_inds is not None else np.arange(lat_lon.shape[1])
+        )
         meta = pd.DataFrame({
             'gid': gids.flatten(),
+            'row_ind': np.repeat(
+                row_inds[:, np.newaxis], len(col_inds), axis=1
+            ).flatten(),
+            'col_ind': np.repeat(
+                col_inds[np.newaxis, :], len(row_inds), axis=0
+            ).flatten(),
             'latitude': lat_lon[..., 0].flatten(),
             'longitude': lat_lon[..., 1].flatten(),
         })
diff --git a/sup3r/writers/nc.py b/sup3r/writers/nc.py
@@ -30,6 +30,8 @@ def _write_output(
         max_workers=None,
         invert_uv=False,
         nn_fill=False,
+        row_inds=None,
+        col_inds=None,
         gids=None,
     ):
         """Write forward pass output to NETCDF file
@@ -59,6 +61,14 @@ def _write_output(
         nn_fill : bool
             Whether to fill data outside of limits with nearest neighbour or
             cap to limits
+        row_inds : np.ndarray
+            Array of row indices for the full high resolution grid. This is
+            used to help with spatial chunk data collection and should be
+            included if the output data is spatially chunked.
+        col_inds : np.ndarray
+            Array of column indices for the full high resolution grid. This is
+            used to help with spatial chunk data collection and should be
+            included if the output data is spatially chunked.
         gids : list
             List of coordinate indices used to label each lat lon pair and to
             help with spatial chunk data collection
@@ -77,9 +87,12 @@ def _write_output(
             Dimension.LATITUDE: (Dimension.dims_2d(), lat_lon[:, :, 0]),
             Dimension.LONGITUDE: (Dimension.dims_2d(), lat_lon[:, :, 1]),
         }
-        data_vars = {}
         if gids is not None:
-            data_vars = {'gids': (Dimension.dims_2d(), gids)}
+            coords['gids'] = (Dimension.dims_2d(), gids)
+        if row_inds is not None and col_inds is not None:
+            for dim, inds in zip(Dimension.dims_2d(), [row_inds, col_inds]):
+                coords[dim] = (dim, inds)
+        data_vars = {}
         for i, f in enumerate(features):
             data_vars[f] = (
                 (Dimension.TIME, *Dimension.dims_2d()),
@@ -95,6 +108,6 @@ def _write_output(
         Cacher._write_single(
             out_file=out_file,
             data=ds,
-            features=features,
+            features=list(data_vars.keys()),
             max_workers=max_workers,
         )
diff --git a/tests/output/test_output_handling.py b/tests/output/test_output_handling.py

Original file line number	Diff line number	Diff line change
`@@ -347,6 +347,8 @@ def make_collect_chunks(td, ext='h5'):`
`347`	`347`	`out_file,`
`348`	`348`	`meta_data=model_meta_data,`
`349`	`349`	`max_workers=1,`
	`350`	`+ row_inds=np.arange(shape[0])[s1_hr],`
	`351`	`+ col_inds=np.arange(shape[1])[s2_hr],`
`350`	`352`	`gids=gids[s1_hr, s2_hr],`
`351`	`353`	`)`
`352`	`354`