Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog for oda_reader

## 1.4.0 (2025-12-19)
- Adds `bulk_download_dac2a()` function for bulk downloading the full DAC2A dataset.
- Auto-detects file types (parquet vs txt/csv) in bulk downloads, removing the need for the `is_txt` parameter.
- Auto-detects CSV delimiters (comma, pipe, tab, semicolon) when reading txt files from bulk downloads.
- Deprecates the `is_txt` parameter in `bulk_download_parquet()`. The parameter is still accepted for backward compatibility but emits a deprecation warning and will be removed in a future major release.
- Adds pytest and pytest-mock to dev dependencies for improved testing support.

## 1.3.5 (2025-12-19)
- Fixes `_get_dataflow_version()` to gracefully handle URLs without a version pattern instead of crashing.

Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "oda_reader"
version = "1.3.5"
version = "1.4.0"
description = "A simple package to import ODA data from the OECD's API and AidData's database"
readme = "README.md"
license = "MIT"
Expand Down Expand Up @@ -42,6 +42,8 @@ build-backend = "uv_build"
[dependency-groups]
dev = [
"pre-commit>=4.0.0",
"pytest>=9.0.2",
"pytest-mock>=3.15.1",
"ruff>=0.14.0",
]
docs = [
Expand Down
3 changes: 2 additions & 1 deletion src/oda_reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)
from oda_reader.crs import bulk_download_crs, download_crs, download_crs_file
from oda_reader.dac1 import download_dac1
from oda_reader.dac2a import download_dac2a
from oda_reader.dac2a import bulk_download_dac2a, download_dac2a
from oda_reader.download.query_builder import QueryBuilder
from oda_reader.multisystem import bulk_download_multisystem, download_multisystem
from oda_reader.tools import get_available_filters
Expand All @@ -38,6 +38,7 @@
"QueryBuilder",
"download_dac1",
"download_dac2a",
"bulk_download_dac2a",
"download_multisystem",
"bulk_download_multisystem",
"download_crs",
Expand Down
1 change: 0 additions & 1 deletion src/oda_reader/crs.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def download_crs_file(
return bulk_download_parquet(
file_id=file_id,
save_to_path=save_to_path,
is_txt=True,
as_iterator=as_iterator,
)

Expand Down
54 changes: 53 additions & 1 deletion src/oda_reader/dac2a.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,38 @@
import typing
from pathlib import Path

import pandas as pd

from oda_reader._cache import cache_info
from oda_reader.common import logger
from oda_reader.download.download_tools import download
from oda_reader.download.download_tools import (
DAC2A_FLOW_URL,
bulk_download_parquet,
download,
get_bulk_file_id,
)

DATAFLOW_ID: str = "DSD_DAC2@DF_DAC2A"
DATAFLOW_VERSION: str = "1.6"


def get_full_dac2a_parquet_id() -> str:
    """Look up the bulk-download file ID for the complete DAC2A dataset.

    The OECD dataflow endpoint is queried for the bulk download link that
    points at the full DAC2A dataset in dotStat format.

    Returns:
        str: File ID accepted by the bulk download service.

    Raises:
        RuntimeError: If no matching file ID is found after the maximum
            number of retries.
    """
    # Label used by the OECD flow page to identify the full dotStat export.
    search_label = "DAC2A full dataset (dotStat format)|"
    return get_bulk_file_id(flow_url=DAC2A_FLOW_URL, search_string=search_label)


@cache_info
def download_dac2a(
start_year: int | None = None,
Expand Down Expand Up @@ -52,3 +77,30 @@ def download_dac2a(
)

return df


def bulk_download_dac2a(
    save_to_path: Path | str | None = None,
    *,
    as_iterator: bool = False,
) -> pd.DataFrame | None | typing.Iterator[pd.DataFrame]:
    """Download the full DAC2A dataset via the bulk download service.

    The file is very large, so saving it to disk is strongly recommended.
    When ``save_to_path`` is omitted, the data is returned as a DataFrame
    instead.

    Args:
        save_to_path: Destination path for the downloaded file. Optional;
            when not given, a DataFrame is returned.
        as_iterator: If ``True``, yields ``DataFrame`` chunks rather than a
            single ``DataFrame``.

    Returns:
        pd.DataFrame | Iterator[pd.DataFrame] | None
    """
    # Resolve the current file ID from the OECD dataflow before downloading.
    bulk_file_id = get_full_dac2a_parquet_id()

    return bulk_download_parquet(
        file_id=bulk_file_id,
        save_to_path=save_to_path,
        as_iterator=as_iterator,
    )
Loading