Skip to content

Move to adaptor backend #298

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
9dd8beb
Support adaptor in prepare_pin_version
nathanjmcdougall Aug 13, 2024
040da5e
Use adaptor in save_data
nathanjmcdougall Aug 13, 2024
4ba393d
Use adaptor for default_title
nathanjmcdougall Aug 13, 2024
7898ce7
underscore prefix for _adaptors.py; abstracting df_type in default_title
nathanjmcdougall Aug 13, 2024
4a3ea01
Removing duplication in _obj_name definition
nathanjmcdougall Aug 13, 2024
007ad3a
Use adaptor in _create_meta
nathanjmcdougall Aug 13, 2024
d577b02
Pass pyright
nathanjmcdougall Aug 13, 2024
3aaabbb
Fix broken import
nathanjmcdougall Aug 13, 2024
56c3285
Refactoring type hints to avoid use of Self
nathanjmcdougall Aug 20, 2024
0171d72
Remove singleton Union
nathanjmcdougall Aug 20, 2024
fe6092f
Add databackend as a dependency
nathanjmcdougall Aug 27, 2024
1289134
Merge branch 'main' into feature/move-to-adaptor-backend
nathanjmcdougall Dec 17, 2024
1d5c47f
dev: add ruff to pyproject.toml
machow Apr 1, 2025
d0fa9c9
feat: allow save_data to accept an Adaptor
machow Apr 1, 2025
81f6779
Remove unnecessary underscores
nathanjmcdougall Apr 1, 2025
1540500
Remove misleading/unnecessary ClassVar declaration
nathanjmcdougall Apr 1, 2025
dd49569
Merge branch 'feature/move-to-adaptor-backend' of https://github.com/…
nathanjmcdougall Apr 1, 2025
daa4239
Separate write_json from to_json (CQS)
nathanjmcdougall Apr 1, 2025
f11141a
Move calls to create_adaptor to hide them at a lower level
nathanjmcdougall Apr 1, 2025
13d356e
Add some tests
nathanjmcdougall Apr 1, 2025
82ba58a
Merge branch 'rstudio:main' into feature/move-to-adaptor-backend
nathanjmcdougall Apr 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions pins/_adaptors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from __future__ import annotations

import json
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, ClassVar, TypeAlias, overload

from databackend import AbstractBackend

if TYPE_CHECKING:
import pandas as pd

PandasDataFrame: TypeAlias = pd.DataFrame
DataFrame: TypeAlias = PandasDataFrame


class AbstractPandasFrame(AbstractBackend):
    # databackend sentinel: isinstance(obj, AbstractPandasFrame) matches a
    # pandas.DataFrame without importing pandas at module load time.
    _backends = [("pandas", "DataFrame")]


# Alias for "any supported dataframe sentinel"; currently pandas only.
AbstractDF: TypeAlias = AbstractPandasFrame


class Adaptor:
    """Wrap an arbitrary Python object behind a uniform save/describe API.

    The base class implements generic fallbacks (JSON, joblib) and raises
    ``NotImplementedError`` for tabular formats; subclasses (e.g. dataframe
    adaptors) override the methods that make sense for their data type.
    """

    def __init__(self, data: Any) -> None:
        self._d = data

    def write_json(self, file: str) -> None:
        """Serialize the wrapped object as JSON and write it to ``file``."""
        with open(file, "w") as f:
            f.write(self.to_json())

    def to_json(self) -> str:
        """Return the wrapped object serialized as a JSON string.

        Raises ``TypeError`` if the object is not JSON-serializable.
        """
        # json is already imported at module level; the local import was redundant.
        return json.dumps(self._d)

    def write_joblib(self, file: str) -> None:
        """Persist the wrapped object with joblib (imported lazily)."""
        import joblib

        joblib.dump(self._d, file)

    def write_csv(self, file: str) -> None:
        # Tabular formats are only supported by dataframe subclasses.
        msg = f"Writing to CSV is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    def write_parquet(self, file: str) -> None:
        msg = f"Writing to Parquet is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    def write_feather(self, file: str) -> None:
        msg = f"Writing to Feather is not supported for {type(self._d)}"
        raise NotImplementedError(msg)

    @property
    def data_preview(self) -> str:
        """JSON payload for the board's index.html preview (empty for non-tabular)."""
        # note that the R library uses jsonlite::toJSON
        # TODO(compat): set display none in index.html
        return json.dumps({})

    def default_title(self, name: str) -> str:
        """Return a human-readable default pin title for ``name``."""
        # TODO(compat): title says CSV rather than data.frame
        # see https://github.com/machow/pins-python/issues/5
        return f"{name}: a pinned {self._obj_name}"

    @property
    def _obj_name(self) -> str:
        # Description of the wrapped object used in default_title.
        return f"{type(self._d).__qualname__} object"


class DFAdaptor(Adaptor):
    """Common interface for dataframe-like wrapped objects.

    NOTE(review): ``@abstractmethod`` here is not enforced at runtime because
    neither this class nor ``Adaptor`` uses ``abc.ABCMeta``; it serves as
    documentation and is checked by static type checkers only — confirm this
    is intentional before instantiating DFAdaptor directly.
    """

    # _d is an instance attribute assigned in __init__, so a plain instance
    # annotation is correct here; the previous ClassVar declaration was
    # misleading (ClassVar means "shared class-level variable").
    _d: DataFrame

    def __init__(self, data: DataFrame) -> None:
        super().__init__(data)

    @property
    def df_type(self) -> str:
        # Consider over-riding this for specialized dataframes
        return "DataFrame"

    @property
    @abstractmethod
    def columns(self) -> list[Any]: ...

    @property
    @abstractmethod
    def shape(self) -> tuple[int, int]: ...

    @abstractmethod
    def head(self, n: int) -> DFAdaptor: ...

    @property
    def data_preview(self) -> str:
        """JSON payload (first 100 rows + column spec) for the index.html preview."""
        # TODO(compat) is 100 hard-coded?
        # Note that we go df -> json -> dict, to take advantage of type conversions in the dataframe library
        data: list[dict[Any, Any]] = json.loads(self.head(100).to_json())
        columns = [
            {"name": [col], "label": [col], "align": ["left"], "type": [""]}
            for col in self.columns
        ]

        # this reproduces R pins behavior, by omitting entries that would be null
        data_no_nulls = [{k: v for k, v in row.items() if v is not None} for row in data]

        return json.dumps({"data": data_no_nulls, "columns": columns})

    @property
    def _obj_name(self) -> str:
        # e.g. "3 x 2 DataFrame" — used by Adaptor.default_title.
        row, col = self.shape
        return f"{row} x {col} {self.df_type}"


class PandasAdaptor(DFAdaptor):
    """DFAdaptor implementation backed by a pandas DataFrame."""

    # Instance attribute set in __init__; ClassVar was incorrect here, since
    # each adaptor instance wraps its own dataframe.
    _d: PandasDataFrame

    def __init__(self, data: AbstractPandasFrame) -> None:
        super().__init__(data)

    @property
    def columns(self) -> list[Any]:
        """Column labels as a plain list."""
        return self._d.columns.tolist()

    @property
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_columns) of the wrapped frame."""
        return self._d.shape

    def head(self, n: int) -> PandasAdaptor:
        """Return a new adaptor wrapping the first ``n`` rows."""
        return PandasAdaptor(self._d.head(n))

    def to_json(self) -> str:
        # orient="records" gives a list of row objects, matching R pins' preview format.
        return self._d.to_json(orient="records")

    def write_csv(self, file: str) -> None:
        self._d.to_csv(file, index=False)

    def write_parquet(self, file: str) -> None:
        self._d.to_parquet(file)

    def write_feather(self, file: str) -> None:
        self._d.to_feather(file)


@overload
def create_adaptor(obj: DataFrame) -> DFAdaptor: ...
@overload
def create_adaptor(obj: Any) -> Adaptor: ...
def create_adaptor(obj: Any | DataFrame) -> Adaptor | DFAdaptor:
    """Wrap ``obj`` in the most specific Adaptor; existing Adaptors pass through."""
    if isinstance(obj, AbstractPandasFrame):
        return PandasAdaptor(obj)
    if isinstance(obj, Adaptor):
        return obj
    return Adaptor(obj)
42 changes: 8 additions & 34 deletions pins/boards.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
from datetime import datetime, timedelta
from io import IOBase
from pathlib import Path
from typing import Protocol
from typing import Any, Protocol

from importlib_resources import files
from importlib_resources.abc import Traversable

from ._adaptors import Adaptor, create_adaptor
from .cache import PinsCache
from .config import get_allow_rsc_short_name
from .drivers import REQUIRES_SINGLE_FILE, default_title, load_data, load_file, save_data
Expand All @@ -25,6 +26,8 @@

_log = logging.getLogger(__name__)

_ = default_title # Keep this import for backward compatibility


class IFileSystem(Protocol):
protocol: str | list
Expand Down Expand Up @@ -715,7 +718,7 @@ def prepare_pin_version(
def _create_meta(
self,
pin_dir_path,
x,
x: Adaptor | Any,
name: str | None = None,
type: str | None = None,
title: str | None = None,
Expand All @@ -732,7 +735,7 @@ def _create_meta(
raise NotImplementedError("Type argument is required.")

if title is None:
title = default_title(x, name)
title = create_adaptor(x).default_title(name)

# create metadata from object on disk ---------------------------------
# save all pin data to a temporary folder (including data.txt), so we
Expand Down Expand Up @@ -1223,46 +1226,17 @@ def prepare_pin_version(self, pin_dir_path, x, name: str | None, *args, **kwargs
# render index.html ------------------------------------------------

all_files = [meta.file] if isinstance(meta.file, str) else meta.file
pin_files = ", ".join(f"""<a href="{x}">{x}</a>""" for x in all_files)
pin_files = ", ".join(f"""<a href="{file}">{file}</a>""" for file in all_files)

context = {
"date": meta.version.created.replace(microsecond=0),
"pin_name": self.path_to_pin(name),
"pin_files": pin_files,
"pin_metadata": meta,
"board_deparse": board_deparse(self),
"data_preview": create_adaptor(x).data_preview,
}

# data preview ----

# TODO: move out data_preview logic? Can we draw some limits here?
# note that the R library uses jsonlite::toJSON

import json

import pandas as pd

if isinstance(x, pd.DataFrame):
# TODO(compat) is 100 hard-coded?
# Note that we go df -> json -> dict, to take advantage of pandas type conversions
data = json.loads(x.head(100).to_json(orient="records"))
columns = [
{"name": [col], "label": [col], "align": ["left"], "type": [""]}
for col in x
]

# this reproduces R pins behavior, by omitting entries that would be null
data_no_nulls = [
{k: v for k, v in row.items() if v is not None} for row in data
]

context["data_preview"] = json.dumps(
{"data": data_no_nulls, "columns": columns}
)
else:
# TODO(compat): set display none in index.html
context["data_preview"] = json.dumps({})

# do not show r code if not round-trip friendly
if meta.type in ["joblib"]:
context["show_r_style"] = "display:none"
Expand Down
64 changes: 19 additions & 45 deletions pins/drivers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from collections.abc import Sequence
from pathlib import Path
from typing import Any

from pins._adaptors import Adaptor, create_adaptor

from .config import PINS_ENV_INSECURE_READ, get_allow_pickle_read
from .errors import PinsInsecureReadError
Expand All @@ -13,15 +16,6 @@
REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib"])


def _assert_is_pandas_df(x, file_type: str) -> None:
import pandas as pd

if not isinstance(x, pd.DataFrame):
raise NotImplementedError(
f"Currently only pandas.DataFrame can be saved as type {file_type!r}."
)


def load_path(filename: str, path_to_version, pin_type=None):
# file path creation ------------------------------------------------------
if pin_type == "table":
Expand Down Expand Up @@ -126,7 +120,7 @@ def load_data(


def save_data(
obj, fname, pin_type=None, apply_suffix: bool = True
obj: "Adaptor | Any", fname, pin_type=None, apply_suffix: bool = True
) -> "str | Sequence[str]":
# TODO: extensible saving with deferred importing
# TODO: how to encode arguments to saving / loading drivers?
Expand All @@ -135,6 +129,11 @@ def save_data(
# as argument to board, and then type dispatchers for explicit cases
# of saving / loading objects different ways.

if isinstance(obj, Adaptor):
adaptor, obj = obj, obj._d
else:
adaptor = create_adaptor(obj)

if apply_suffix:
if pin_type == "file":
suffix = "".join(Path(obj).suffixes)
Expand All @@ -149,39 +148,22 @@ def save_data(
final_name = f"{fname}{suffix}"

if pin_type == "csv":
_assert_is_pandas_df(obj, file_type=type)

obj.to_csv(final_name, index=False)

adaptor.write_csv(final_name)
elif pin_type == "arrow":
# NOTE: R pins accepts the type arrow, and saves it as feather.
# we allow reading this type, but raise an error for writing.
_assert_is_pandas_df(obj, file_type=type)

obj.to_feather(final_name)

adaptor.write_feather(final_name)
elif pin_type == "feather":
_assert_is_pandas_df(obj, file_type=type)

raise NotImplementedError(
msg = (
'Saving data as type "feather" no longer supported. Use type "arrow" instead.'
)

raise NotImplementedError(msg)
elif pin_type == "parquet":
_assert_is_pandas_df(obj, file_type=type)

obj.to_parquet(final_name)

adaptor.write_parquet(final_name)
elif pin_type == "joblib":
import joblib

joblib.dump(obj, final_name)

adaptor.write_joblib(final_name)
elif pin_type == "json":
import json

json.dump(obj, open(final_name, "w"))

adaptor.write_json(final_name)
elif pin_type == "file":
import contextlib
import shutil
Expand All @@ -202,14 +184,6 @@ def save_data(
return final_name


def default_title(obj, name):
import pandas as pd

if isinstance(obj, pd.DataFrame):
# TODO(compat): title says CSV rather than data.frame
# see https://github.com/machow/pins-python/issues/5
shape_str = " x ".join(map(str, obj.shape))
return f"{name}: a pinned {shape_str} DataFrame"
else:
obj_name = type(obj).__qualname__
return f"{name}: a pinned {obj_name} object"
def default_title(obj: Any, name: str) -> str:
    """Return a default pin title for ``obj``.

    Kept for backward compatibility only; delegates to the adaptor layer.
    """
    adaptor = create_adaptor(obj)
    return adaptor.default_title(name)
Loading
Loading