Skip to content

Commit 1b76edc

Browse files
authored
Version 0.0.11 (#65)
* adding version * formatting * added versions to metadata of generated schemas * testing generated_with * test for optional columns * renamed na_limit to na_pct_below * adding to changelog * bumping version in pyproject.toml * formatted * import version only when needed * using importlib.metadata * fixing self.na_limit * fix * fixing version test * update cli * generate metadata upon initialisation * fixing package name * raising * substitute importlib_metadata if python=3.7 * test cli update * updating changelog * stdout * stdout * small refactor of extension check * out * removing qoute * cli
1 parent e376b13 commit 1b76edc

19 files changed

+225
-51
lines changed

changelog.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
v0.0.11:
4+
- Version in metadata
5+
- adds `dfschema` and `pandas` versions to the metadata upon generation (a later release will warn if a Schema is initialized from JSON that was generated by a newer version)
6+
- Renamed `na_limit` to `na_pct_below` to make it unambiguous (with backward support)
7+
- Added `optional=True` flag for columns. If true, does not raise exception if column is not present
8+
- added `dfschema update {existing_schema} {output_schema}` command to upgrade schemas
9+
310
v0.0.10:
411
- relaxed Pydantic requirement to `>=1.9`
512

@@ -31,7 +38,7 @@ v0.0.6:
3138
- added pre-commit install to the repo
3239
- Some benchmarking
3340
- renamed `dfs.validate_df` to `dfs.validate`
34-
41+
3542
v0.0.5: fix column dtype generation/validation bug
3643

3744
## Pre-Publication

dfschema/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
DataFrameSummaryError,
88
)
99

10+
__version__ = "0.0.11"
1011

1112
__all__ = [
1213
"validate",
@@ -16,4 +17,5 @@
1617
"DataFrameSchemaError",
1718
"DataFrameValidationError",
1819
"DataFrameSummaryError",
20+
"__version__",
1921
]

dfschema/cli.py

+19
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,22 @@ def generate(
100100
raise ValueError(
101101
f"Unsupported extension: {format}, should be one of [json, yaml]"
102102
)
103+
104+
105+
@app.command()
106+
def update(
107+
input: Path = typer.Argument(..., help="input schema file"),
108+
output: Path = typer.Argument(..., help="output schema file"),
109+
):
110+
allowed = (".json", ".yaml", ".yml")
111+
for name, f in zip(("input", "output"), (input, output)):
112+
if f.suffix not in allowed:
113+
raise ValueError(
114+
f"Argument `{name}` should end with one of {allowed}, got {f}"
115+
)
116+
117+
schema = DfSchema.from_file(input)
118+
protocol_version = schema.metadata.protocol_version
119+
print(f"Writing with `{protocol_version}` to `{output}`")
120+
121+
schema.to_file(output)

dfschema/core/column.py

+25-9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import sys
22
from datetime import date, datetime
3-
from typing import List, Optional, FrozenSet, Union, Tuple # , Pattern
3+
from typing import List, Optional, FrozenSet, Union, Tuple, Set # , Pattern
44
from warnings import warn
55

66
import pandas as pd
@@ -21,6 +21,7 @@
2121
def _validate_column_presence(
2222
df: pd.DataFrame,
2323
column_names: Tuple[str],
24+
optional_columns: Set[str] = set(),
2425
additionalColumns: bool = True,
2526
exactColumnOrder: bool = False,
2627
) -> None:
@@ -33,7 +34,11 @@ def _validate_column_presence(
3334
text = f"Some columns should not be in dataframe: {other_cols}"
3435
raise DataFrameValidationError(text)
3536

36-
lac_cols = [col for col in column_names if col not in df.columns]
37+
lac_cols = [
38+
col
39+
for col in column_names
40+
if (col not in df.columns) and (col not in optional_columns)
41+
]
3742
if len(lac_cols) != 0:
3843
text = f"Some columns are not in dataframe: {lac_cols}"
3944
raise DataFrameValidationError(text)
@@ -140,7 +145,13 @@ def validate_column(self, series: pd.Series, root, col_name: Optional[str] = Non
140145

141146

142147
class Categorical(BaseModel): # type: ignore
143-
value_set: Optional[Union[FrozenSet[int], FrozenSet[float], FrozenSet[str],]] = None
148+
value_set: Optional[
149+
Union[
150+
FrozenSet[int],
151+
FrozenSet[float],
152+
FrozenSet[str],
153+
]
154+
] = None
144155
mode: Optional[Literal["oneof", "exact_set", "include"]] = None
145156
unique: bool = Field(
146157
False, description="if true, the column must contain only unique values"
@@ -188,16 +199,21 @@ def validate_column(self, series: pd.Series, col_name: str, root) -> None:
188199
class ColSchema(BaseModel):
189200
name: str = Field(..., description="Name of the column")
190201
dtype: Optional[DtypeLiteral] = Field(None, description="Data type of the column") # type: ignore
191-
202+
optional: Optional[bool] = Field(
203+
None,
204+
description="If true, will not raise exception if columns is not present in dataframe",
205+
)
192206
# accepted for value limitation checks
193207
_val_accepted_types = {None, "int", "float", "datetime64[ns]"}
194208

195-
na_limit: Optional[float] = Field(
209+
na_pct_below: Optional[float] = Field(
196210
None,
197211
ge=0,
198212
lt=1.0,
199-
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than that fraction of values are empty (Nan)",
213+
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than given perecnt of values are empty (Nan)",
214+
alias="na_limit",
200215
)
216+
201217
value_limits: Optional[ValueLimits] = Field(
202218
None, description="Value limits for the column"
203219
)
@@ -257,8 +273,8 @@ def _validate_dtype(self, series: pd.Series) -> None:
257273
def _validate_na_limit(self, series: pd.Series) -> None:
258274
na_fraction = series.isnull().mean()
259275

260-
if na_fraction > self.na_limit: # type: ignore
261-
text = f"Column `{self.name}` has too many NAs: {na_fraction}, should be <= {self.na_limit}"
276+
if na_fraction > self.na_pct_below: # type: ignore
277+
text = f"Column `{self.name}` has too many NAs: {na_fraction}, should be <= {self.na_pct_below}"
262278
raise DataFrameValidationError(text)
263279

264280
@exception_collector
@@ -294,7 +310,7 @@ def validate_column(self, series: pd.Series, root) -> None:
294310
if self.dtype:
295311
self._validate_dtype(series, root=root)
296312

297-
if self.na_limit:
313+
if self.na_pct_below:
298314
self._validate_na_limit(series, root=root)
299315

300316
if self.value_limits:

dfschema/core/config.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import sys
2+
3+
if sys.version_info >= (3, 8):
4+
from typing import Final
5+
else:
6+
from typing_extensions import Final
7+
8+
CURRENT_PROTOCOL_VERSION: Final = 2.0

dfschema/core/core.py

+17-29
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,23 @@
1-
from datetime import date
21
from typing import Callable, Optional, Union, List
32
import json
43
from pathlib import Path
54

65

76
import pandas as pd
87
from pydantic import BaseModel, Extra, Field, PrivateAttr
9-
import sys
108

119
from .column import ColSchema, _validate_column_presence
1210
from .exceptions import DataFrameSchemaError, DataFrameSummaryError, SubsetSummaryError
1311
from .shape import ShapeSchema
1412
from .legacy import infer_protocol_version, LegacySchemaRegistry
1513
from .generate import generate_schema_dict_from_df
14+
from .metadata import MetaData
15+
from .config import CURRENT_PROTOCOL_VERSION
1616

1717
# from .utils import SchemaEncoder
1818
# from .base_config import BaseConfig
1919

2020

21-
if sys.version_info >= (3, 8):
22-
from typing import Final
23-
else:
24-
from typing_extensions import Final
25-
26-
CURRENT_PROTOCOL_VERSION: Final = 2.0
27-
28-
29-
class MetaData(BaseModel):
30-
protocol_version: float = Field(
31-
CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
32-
)
33-
version: Optional[str] = Field(
34-
date.today().strftime("%Y-%m-%d"),
35-
description="version of the schema",
36-
example="2022-06-12",
37-
)
38-
39-
custom_settings: Optional[dict] = Field(
40-
None, description="custom settings. does not affect any logic"
41-
)
42-
43-
4421
class DfSchema(BaseModel): # type: ignore
4522
"""Main class of the package
4623
@@ -104,8 +81,14 @@ def _summary_error(self) -> DataFrameSummaryError:
10481

10582
def validate_column_presence(self, df: pd.DataFrame) -> None:
10683
schema_col_names = {col.name for col in self.columns} # type: ignore
84+
optional_columns = {col.name for col in self.columns if col.optional}
85+
10786
_validate_column_presence(
108-
df, schema_col_names, additionalColumns=self.additionalColumns, root=self
87+
df,
88+
schema_col_names,
89+
optional_columns=optional_columns,
90+
additionalColumns=self.additionalColumns,
91+
root=self,
10992
)
11093

11194
def validate_df(self, df: pd.DataFrame, summary: bool = True) -> None:
@@ -230,7 +213,6 @@ def to_file(self, path: Union[str, Path]) -> None:
230213
path = Path(path)
231214

232215
try:
233-
234216
if path.suffix == ".json":
235217
schema_json = self.json(exclude_none=True, indent=4)
236218
with path.open("w") as f:
@@ -254,7 +236,10 @@ def to_file(self, path: Union[str, Path]) -> None:
254236
raise DataFrameSchemaError(f"Error wriging schema to file {path}") from e
255237

256238
@classmethod
257-
def from_dict(cls, dict_: dict,) -> "DfSchema":
239+
def from_dict(
240+
cls,
241+
dict_: dict,
242+
) -> "DfSchema":
258243
"""create DfSchema from dict.
259244
260245
same as `DfSchema(**dict_)`, but will also migrate old protocol schemas if necessary.
@@ -329,7 +314,10 @@ class SubsetSchema(BaseModel, extra=Extra.forbid, arbitrary_types_allowed=True):
329314
predicate to select subset.
330315
- If string, will be interpreted as query for `df.query()`.
331316
- If dict, keys should be column names, values should be values to exactly match"""
332-
predicate: Union[dict, str,] = Field(..., description=_predicate_description)
317+
predicate: Union[
318+
dict,
319+
str,
320+
] = Field(..., description=_predicate_description)
333321

334322
shape: Optional[ShapeSchema] = Field(None, description="shape expectations")
335323
columns: Optional[List[ColSchema]] = Field([], description="columns expectations")

dfschema/core/legacy/v1.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ class Config:
5454
allow_population_by_field_name = True
5555

5656
version: Optional[str] = Field(
57-
None, description="version of the schema", example="2022-06-12",
57+
None,
58+
description="version of the schema",
59+
example="2022-06-12",
5860
)
5961

6062
protocol_version: float = Field(1.0, description="version of the protocol")

dfschema/core/metadata.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import sys
2+
from datetime import date
3+
from typing import Optional
4+
5+
from pydantic import BaseModel, Field
6+
7+
from .config import CURRENT_PROTOCOL_VERSION
8+
9+
10+
class Generated_With(BaseModel):
11+
@property
12+
def dfschema(self) -> str:
13+
if sys.version_info >= (3, 8):
14+
from importlib.metadata import version
15+
else:
16+
from importlib_metadata import version
17+
18+
return version("dfschema")
19+
20+
@property
21+
def pandas(self) -> str:
22+
import pandas as pd
23+
24+
return pd.__version__
25+
26+
27+
class MetaData(BaseModel):
28+
protocol_version: float = Field(
29+
CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
30+
)
31+
version: Optional[str] = Field(
32+
date.today().strftime("%Y-%m-%d"),
33+
description="version of the schema",
34+
example="2022-06-12",
35+
)
36+
37+
generated_with: Generated_With = Field(
38+
Generated_With(), description="version of packages schema was generated with"
39+
)
40+
custom_settings: Optional[dict] = Field(
41+
None, description="custom settings. does not affect any logic"
42+
)

dfschema/utils.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import pandas as pd
2-
from datetime import date
32

43
from .core.exceptions import DataFrameValidationError
54

@@ -51,13 +50,12 @@ def generate_scheme(
5150
exactColumnOrder: bool = False,
5251
na_thlds: bool = True,
5352
minmax: bool = True,
54-
version: str = f"{date.today():%Y-%m-%d}",
5553
) -> dict:
56-
"""generates dummy scheme over given dataframe"""
54+
"""generates dummy schema over given dataframe"""
55+
5756
schema: dict = {
5857
"additionalColumns": additionalColumns,
5958
"exactColumnOrder": exactColumnOrder,
60-
"version": version,
6159
}
6260

6361
cols: dict = {"dtype": df.dtypes.astype(str).to_dict()}

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "dfschema"
3-
version = "0.0.10"
3+
version = "0.0.11"
44
description = "lightweight pandas.DataFrame schema"
55
authors = ["Philipp <[email protected]>"]
66
readme = "README.md"

tests/conftest.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ def df3():
3333
@pytest.fixture()
3434
def df4():
3535
df = pd.DataFrame(
36-
{"x": [1, 2, 3, 4], "y": ["foo", "bar", "baz", None], "z": ["2022-10-23",] * 4}
36+
{
37+
"x": [1, 2, 3, 4],
38+
"y": ["foo", "bar", "baz", None],
39+
"z": [
40+
"2022-10-23",
41+
]
42+
* 4,
43+
}
3744
)
3845
df["z"] = pd.to_datetime(df["z"])
3946
return df

tests/test_cli.py

+20
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,23 @@ def test_cli_validate_error():
5050
)
5151
# assert result.exit_code == 1
5252
assert "File violates schema:" in result.stderr
53+
54+
55+
def test_cli_update():
56+
from dfschema.cli import app
57+
from dfschema.core.config import CURRENT_PROTOCOL_VERSION
58+
59+
output_path = "active_sales_v2.json"
60+
result = runner.invoke(
61+
app,
62+
[
63+
"update",
64+
"tests/test_schemas/v1/good/active_sales.json",
65+
output_path,
66+
],
67+
)
68+
69+
assert result.exit_code == 0, result.stdout
70+
71+
string_to_be = f"Writing with `{CURRENT_PROTOCOL_VERSION}` to `{output_path}`"
72+
assert string_to_be in result.stdout

0 commit comments

Comments
 (0)