diff --git a/prepare.py b/prepare.py
index 06bea9165..f915672b5 100644
--- a/prepare.py
+++ b/prepare.py
@@ -38,7 +38,11 @@
 CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "autoresearch")
 DATA_DIR = os.path.join(CACHE_DIR, "data")
 TOKENIZER_DIR = os.path.join(CACHE_DIR, "tokenizer")
-BASE_URL = "https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle/resolve/main"
+# Hub endpoint; set HF_ENDPOINT to download the shards through a mirror or proxy.
+# An unset *or empty* variable falls back to the official hub, and trailing
+# slashes are stripped so BASE_URL never ends up with "//" after the host.
+_HF_ENDPOINT = (os.environ.get("HF_ENDPOINT") or "https://huggingface.co").rstrip("/")
+BASE_URL = f"{_HF_ENDPOINT}/datasets/karpathy/climbmix-400b-shuffle/resolve/main"
 MAX_SHARD = 6542 # the last datashard is shard_06542.parquet
 VAL_SHARD = MAX_SHARD # pinned validation shard (shard_06542)
 VAL_FILENAME = f"shard_{VAL_SHARD:05d}.parquet"