99import xarray as xr
1010from rex .utilities .loggers import init_logger
1111
12- from sup3r .preprocessing . loaders import Loader
12+ from sup3r .preprocessing import Loader
1313from sup3r .preprocessing .names import Dimension
1414from sup3r .writers import Cacher
1515
@@ -32,18 +32,9 @@ def collect(
3232 overwrite = True ,
3333 res_kwargs = None ,
3434 cacher_kwargs = None ,
35- is_regular_grid = True ,
3635 ):
3736 """Collect data files from a dir to one output file.
3837
39- TODO: For a regular grid (lat values are constant across lon and vice
40- versa) collecting lat / lon chunks is supported. For curvilinear grids
41- only collection of chunks that are split by latitude are supported.
42- This should be generalized to allow for any spatial chunking and any
43- dimension. I think this would require a new file naming scheme with a
44- spatial index for both latitude and longitude or checking each chunk
45- to see how they are split.
46-
4738 Filename requirements:
4839 - Should end with ".nc"
4940
@@ -61,23 +52,12 @@ def collect(
6152 Desired log level, None will not initialize logging.
6253 log_file : str | None
6354 Target log file. None logs to stdout.
64- write_status : bool
65- Flag to write status file once complete if running from pipeline.
66- job_name : str
67- Job name for status file if running from pipeline.
6855 overwrite : bool
6956 Whether to overwrite existing output file
7057 res_kwargs : dict | None
7158 Dictionary of kwargs to pass to xarray.open_mfdataset.
7259 cacher_kwargs : dict | None
7360 Dictionary of kwargs to pass to Cacher._write_single.
74- is_regular_grid : bool
75- Whether the data is on a regular grid. If True then spatial chunks
76- can be combined across both latitude and longitude. If False then
77- spatial chunks must all have the same longitude values to be
78- combined. If you need completely general chunk collection then
79- you should write chunks to `h5` files and use
80- :class:`sup3r.postprocessing.collectors.h5.CollectorH5`.
8161 """
8262 logger .info (f'Initializing collection for file_paths={ file_paths } ' )
8363
@@ -97,33 +77,18 @@ def collect(
9777 logger .info (f'overwrite=True, removing { out_file } .' )
9878 os .remove (out_file )
9979
100- spatial_chunks = collector .group_spatial_chunks ()
101-
10280 if not os .path .exists (out_file ):
103- res_kwargs = res_kwargs or {
104- 'combine' : 'nested' ,
105- 'concat_dim' : Dimension .TIME ,
106- }
107- for s_idx , sfiles in spatial_chunks .items ():
108- schunk = Loader (sfiles , res_kwargs = res_kwargs )
109- spatial_chunks [s_idx ] = schunk
110-
111- # Set lat / lon as 1D arrays if regular grid and get the
112- # xr.Dataset _ds
113- if is_regular_grid :
114- spatial_chunks = {
115- s_idx : schunk .set_regular_grid ()._ds
116- for s_idx , schunk in spatial_chunks .items ()
117- }
118- out = xr .combine_by_coords (
119- spatial_chunks .values (), combine_attrs = 'override'
120- )
121-
122- else :
123- out = xr .concat (
124- [sc ._ds for sc in spatial_chunks .values ()],
125- dim = Dimension .SOUTH_NORTH ,
126- )
81+ dsets = list (
82+ collector .group_spatial_chunks (res_kwargs = res_kwargs ).values ()
83+ )
84+
85+ # Reset coords so that they are data_vars and can be combined
86+ # across chunks. This is needed because coords can be 2d arrays,
87+ # which can't be used to combine chunks. After combination, set
88+ # them back to coords.
89+ dsets = [ds .reset_coords (Dimension .coords_2d ()) for ds in dsets ]
90+ out = xr .combine_by_coords (dsets , combine_attrs = 'override' )
91+ out = out .set_coords (Dimension .coords_2d ())
12792
12893 cacher_kwargs = cacher_kwargs or {}
12994 Cacher ._write_single (
@@ -135,13 +100,14 @@ def collect(
135100
136101 logger .info ('Finished file collection.' )
137102
def group_spatial_chunks(self, res_kwargs=None):
    """Group files with the same spatial footprint into single loaders.

    Files in ``self.flist`` that share a spatial chunk index cover the
    same spatial extent but different time periods; each group is opened
    as one `Loader` so its time steps are combined.

    Parameters
    ----------
    res_kwargs : dict | None
        Dictionary of kwargs passed through to the `Loader` for each
        group of files (ultimately forwarded to
        ``xarray.open_mfdataset``).

    Returns
    -------
    dict
        Mapping of spatial chunk index to a `Loader` built from the
        sorted list of files sharing that spatial footprint.
    """
    file_groups = {}
    for fpath in self.flist:
        # get_chunk_indices returns (time_index, spatial_index); only
        # the spatial index is needed for grouping.
        _, s_idx = self.get_chunk_indices(fpath)
        # setdefault + append is O(1) per file, unlike rebuilding the
        # list with unpacking on every iteration (O(n^2) overall).
        file_groups.setdefault(s_idx, []).append(fpath)
    # Sort each group so time chunks are opened in a deterministic order
    return {
        s_idx: Loader(sorted(files), res_kwargs=res_kwargs)
        for s_idx, files in file_groups.items()
    }
0 commit comments