From 0a54422b2579cea8e68d258f82859f8f27b41d54 Mon Sep 17 00:00:00 2001 From: Steph Bench Date: Wed, 15 Mar 2023 09:56:21 +0100 Subject: [PATCH 1/3] Add kerchunk-nc cli, initial work --- kerchunk/cli/__init__.py | 0 kerchunk/cli/chunk_nc.py | 105 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 kerchunk/cli/__init__.py create mode 100644 kerchunk/cli/chunk_nc.py diff --git a/kerchunk/cli/__init__.py b/kerchunk/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kerchunk/cli/chunk_nc.py b/kerchunk/cli/chunk_nc.py new file mode 100644 index 00000000..9036a0d0 --- /dev/null +++ b/kerchunk/cli/chunk_nc.py @@ -0,0 +1,105 @@ +import json +import logging +from pathlib import Path +from typing import List + +import click +import fsspec + +from kerchunk.hdf import SingleHdf5ToZarr +from kerchunk.combine import MultiZarrToZarr + +logger = logging.getLogger("kercli") + + +class NetcdfChunker: + dataset_name: str = "mydataset" + input: List[str] + input_format: str = "nc" + input_fs_args: dict = {"anon": True} + json_dir: Path = Path("json") + zarr_output: Path = Path("zarr") + force_scan: bool = False + + def __init__(self, *args, **kwargs): + self.__dict__.update(**kwargs) + self.json_dir = Path(self.json_dir) + self.zarr_output = Path(self.zarr_output) + + @property + def json_dataset_dir(self) -> Path: + fp = self.json_dir / self.dataset_name + if not fp.exists(): fp.mkdir(parents=True) + return fp + + def run(self): + self.scan_them_all() + self.consolidate() + + def scan_them_all(self): + # Scan all input file + for fp in self.input: + fp_json = self.json_dataset_dir / Path(fp).parent.as_posix() / Path(fp).with_suffix(".json").name + fp_json.parent.mkdir(parents=True, exist_ok=True) + + if fp_json.exists() and not self.force_scan: + logger.info(f"Scanning {fp} : {fp_json} already exists, skipping") + continue + + logger.info(f"Scanning {fp} ...") + with fsspec.open(fp, **self.input_fs_args) as fd: + try: + h5chunk = SingleHdf5ToZarr(fd, fp, inline_threshold=100) + json_data = h5chunk.translate() + except OSError as e: + logger.error(str(e)) + continue + + logger.info(f"Saving to {fp_json}") + with fp_json.open("w") as fd: + json.dump(json_data, fd, indent=2) + + def consolidate(self): + fp_json_list = [str(fp) for fp in self.json_dataset_dir.glob("**/*.json")] + logger.info(f"Data loaded from {self.json_dataset_dir} : {len(fp_json_list)} found") + mzz = MultiZarrToZarr( + fp_json_list, + concat_dims=["time0"] + ) + + fp_zarr = self.zarr_output / f"{self.dataset_name}.zarr" + logger.info(f"Consolidating to {fp_zarr} ...") + mzz.translate(filename=str(fp_zarr)) + + +def str_to_json(ctx, param, value): + if value and not isinstance(value, dict): + value = json.loads(value) + return value + +@click.command() +@click.option("--name", help="Dataset name", default="mydataset") +@click.option("--input", "-i", + help="Input file url, readable by fsspec", required=True, + multiple=True) +@click.option("--input-format", default="nc") +@click.option("--input-fs-args", help="Arguments that will be passed to fsspec.open()", + type=click.UNPROCESSED, callback=str_to_json, default={'anon': True}) +@click.option("--json-dir", help="Where to store scan output as json", default=Path("json")) +@click.option("--zarr-output", help="Output of fully merged kerchunk zarr file", default=Path("zarr")) +@click.option("--force-scan", + help="Force scanning input file, even if json file exists", + is_flag=True, default=False) +@click.option('--verbose', '-v', is_flag=True) +def cli(verbose: bool, **kwargs): + logging.basicConfig(level=logging.INFO) + if verbose: + # Show what's going on with fsspec + _logger = logging.getLogger("fsspec") + _logger.setLevel(logging.DEBUG) + c = NetcdfChunker(**kwargs) + c.run() + + +if __name__ == "__main__": + cli() From 28b68d29c4f400430fb03f285b607adcd5e22ed7 Mon Sep 17 00:00:00 2001 From: Steph Bench Date: Wed, 15 Mar 2023 10:03:53 +0100 Subject: [PATCH 2/3] Add entry point and click requirements --- kerchunk/cli/chunk_nc.py | 5 +++-- requirements.txt | 1 + setup.py | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kerchunk/cli/chunk_nc.py b/kerchunk/cli/chunk_nc.py index 9036a0d0..a330e785 100644 --- a/kerchunk/cli/chunk_nc.py +++ b/kerchunk/cli/chunk_nc.py @@ -78,13 +78,13 @@ def str_to_json(ctx, param, value): return value @click.command() -@click.option("--name", help="Dataset name", default="mydataset") +@click.option("--name", help="Dataset name", default="mydataset", show_default=True) @click.option("--input", "-i", help="Input file url, readable by fsspec", required=True, multiple=True) @click.option("--input-format", default="nc") @click.option("--input-fs-args", help="Arguments that will be passed to fsspec.open()", - type=click.UNPROCESSED, callback=str_to_json, default={'anon': True}) + type=click.UNPROCESSED, callback=str_to_json, default={'anon': True}, show_default=True) @click.option("--json-dir", help="Where to store scan output as json", default=Path("json")) @click.option("--zarr-output", help="Output of fully merged kerchunk zarr file", default=Path("zarr")) @click.option("--force-scan", @@ -92,6 +92,7 @@ def str_to_json(ctx, param, value): is_flag=True, default=False) @click.option('--verbose', '-v', is_flag=True) def cli(verbose: bool, **kwargs): + """ Cli for ker-chunking local or remote NetCDF files""" logging.basicConfig(level=logging.INFO) if verbose: # Show what's going on with fsspec diff --git a/requirements.txt b/requirements.txt index 1a0e1b86..7b26c4f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ numcodecs numpy ujson zarr +click \ No newline at end of file diff --git a/setup.py b/setup.py index a29c3fa1..b26048c4 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,9 @@ "FITSVarBintable = kerchunk.codecs:VarArrCodec", "record_member = kerchunk.codecs.RecordArrayMember", ], + 'console_scripts': [ + 'kerchunk-nc = kerchunk.cli.chunk_nc:cli', + ], }, zip_safe=False, ) From f4f5e18573b1c49913bb2fa622bf7b07fa696fa0 Mon Sep 17 00:00:00 2001 From: Steph Bench Date: Thu, 16 Mar 2023 10:00:16 +0100 Subject: [PATCH 3/3] Working on ... --- kerchunk/cli/chunk_nc.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kerchunk/cli/chunk_nc.py b/kerchunk/cli/chunk_nc.py index a330e785..f0e99f89 100644 --- a/kerchunk/cli/chunk_nc.py +++ b/kerchunk/cli/chunk_nc.py @@ -5,6 +5,7 @@ import click import fsspec +from fsspec.core import url_to_fs from kerchunk.hdf import SingleHdf5ToZarr from kerchunk.combine import MultiZarrToZarr @@ -16,7 +17,7 @@ class NetcdfChunker: dataset_name: str = "mydataset" input: List[str] input_format: str = "nc" - input_fs_args: dict = {"anon": True} + input_fs_args: dict = {} json_dir: Path = Path("json") zarr_output: Path = Path("zarr") force_scan: bool = False @@ -33,12 +34,22 @@ def json_dataset_dir(self) -> Path: return fp def run(self): + self.get_uri_list() self.scan_them_all() self.consolidate() + def get_uri_list(self) -> List[str]: + _uri_list = [] + for fs_glob in self.input: + fs, _ = url_to_fs(fs_glob) + for input_uri in fs.glob(fs_glob): + _uri_list.append(input_uri) + _uri_list = list(set(_uri_list)) + return sorted(_uri_list) + def scan_them_all(self): # Scan all input file - for fp in self.input: + for fp in self.get_uri_list(): fp_json = self.json_dataset_dir / Path(fp).parent.as_posix() / Path(fp).with_suffix(".json").name fp_json.parent.mkdir(parents=True, exist_ok=True) @@ -64,7 +75,7 @@ def consolidate(self): logger.info(f"Data loaded from {self.json_dataset_dir} : {len(fp_json_list)} found") mzz = MultiZarrToZarr( fp_json_list, - concat_dims=["time0"] + concat_dims=["analysis_time", "step"] ) fp_zarr = self.zarr_output / f"{self.dataset_name}.zarr" @@ -78,13 +89,13 @@ def str_to_json(ctx, param, value): return value @click.command() -@click.option("--name", help="Dataset name", default="mydataset", show_default=True) +@click.option("--dataset-name", help="Dataset name", default="mydataset", show_default=True) @click.option("--input", "-i", help="Input file url, readable by fsspec", required=True, multiple=True) @click.option("--input-format", default="nc") @click.option("--input-fs-args", help="Arguments that will be passed to fsspec.open()", - type=click.UNPROCESSED, callback=str_to_json, default={'anon': True}, show_default=True) + type=click.UNPROCESSED, callback=str_to_json, default={}, show_default=True) @click.option("--json-dir", help="Where to store scan output as json", default=Path("json")) @click.option("--zarr-output", help="Output of fully merged kerchunk zarr file", default=Path("zarr")) @click.option("--force-scan",