Skip to content
This repository was archived by the owner on Apr 2, 2026. It is now read-only.

Commit 4d58d36

Browse files
authored
Merge pull request #161 from realratchet/master
fix timedeltas and non-primitive filter
2 parents 27f39c6 + eb63209 commit 4d58d36

5 files changed

Lines changed: 238 additions & 27 deletions

File tree

nimlite.nimble

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Package
22

3-
version = "0.3.0"
3+
version = "0.3.1"
44
author = "Ratchet"
55
description = "Utilities for tablite to work with nim"
66
license = "MIT"

nimlite/funcs/filter.nim

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
147147
let pyType = builtins.getTypeName(pyVal)
148148
let obj: PY_ObjectND = (
149149
case pyType
150+
of "NoneType": PY_None
150151
of "int": newPY_Object(pyVal.to(int))
151152
of "float": newPY_Object(pyVal.to(float))
152153
of "bool": newPY_Object(pyVal.to(bool))

nimlite/numpy.nim

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@ type NDArrayTypeDescriptor = enum
2222
D_BOOLEAN
2323
D_INT
2424
D_FLOAT
25-
D_TIME
2625
D_DATE_DAYS
26+
D_TIME_SECONDS
27+
D_TIME_MILISECONDS
28+
D_TIME_MICROSECONDS
2729
D_DATETIME_SECONDS
2830
D_DATETIME_MILISECONDS
2931
D_DATETIME_MICROSECONDS
@@ -523,16 +525,20 @@ proc consumeDescr(header: var string, header_len: int, offset: var int): NDArray
523525
descriptor = NDArrayTypeDescriptor.D_OBJECT
524526
of 'm':
525527
case dt_descriptor:
528+
of "us": NDArrayTypeDescriptor.D_TIME_MICROSECONDS
529+
of "ms": NDArrayTypeDescriptor.D_TIME_MILISECONDS
530+
of "s": NDArrayTypeDescriptor.D_TIME_SECONDS
526531
else: implement(descr)
527532
of 'M':
528-
case dt_descriptor:
529-
of "D":
530-
size = 8
531-
descriptor = NDArrayTypeDescriptor.D_DATE_DAYS
532-
of "us":
533-
size = 8
534-
descriptor = NDArrayTypeDescriptor.D_DATETIME_MICROSECONDS
535-
else: implement(descr)
533+
size = 8
534+
descriptor = (
535+
case dt_descriptor:
536+
of "D": NDArrayTypeDescriptor.D_DATE_DAYS
537+
of "us": NDArrayTypeDescriptor.D_DATETIME_MICROSECONDS
538+
of "ms": NDArrayTypeDescriptor.D_DATETIME_MILISECONDS
539+
of "s": NDArrayTypeDescriptor.D_DATETIME_SECONDS
540+
else: implement(descr)
541+
)
536542
else:
537543
size = parseInt(descr[type_offset+1..descr.len-1])
538544

@@ -659,6 +665,33 @@ proc newDateTimeArray_Microseconds(fh: var File, endianness: Endianness, shape:
659665

660666
return DateTimeNDArray(buf: buf, shape: shape)
661667

668+
proc newTimeArray_Seconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} =
  ## Reads a page of int64 values from `fh` (sized by `shape`), treats each
  ## value as a count of seconds and wraps it via
  ## `newPY_Object(seconds2Duration(...))`.
  ##
  ## Every element is tallied under `K_TIME` in the resulting `dtypes` table.
  ## `endianness` is part of the shared reader signature but is not consulted
  ## by this proc.
  let raw = readPrimitiveBuffer[int64](fh, shape)
  let durations = collect:
    for secs in raw:
      newPY_Object(seconds2Duration(float(secs)))

  ObjectNDArray(buf: durations, shape: shape, dtypes: {K_TIME: raw.len}.toTable)
676+
677+
proc newTimeArray_Miliseconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} =
  ## Reads a page of int64 millisecond counts from `fh` (sized by `shape`) and
  ## wraps each one via `newPY_Object(seconds2Duration(...))`.
  ##
  ## Every element is tallied under `K_TIME` in the resulting `dtypes` table.
  ## `endianness` is part of the shared reader signature but is not consulted
  ## by this proc.
  let data = readPrimitiveBuffer[int64](fh, shape)
  let dtypes = {K_TIME: data.len}.toTable
  let buf = collect:
    for v in data:
      # The seconds reader hands the raw count straight to `seconds2Duration`,
      # so that helper works in seconds. A millisecond count therefore has to
      # be scaled DOWN by 1_000; the earlier code multiplied instead, which
      # inflated every duration by a factor of one million.
      newPY_Object(seconds2Duration(float(v) / 1_000.0))

  return ObjectNDArray(buf: buf, shape: shape, dtypes: dtypes)
685+
686+
proc newTimeArray_Microseconds(fh: var File, endianness: Endianness, shape: var Shape): ObjectNDArray {.inline.} =
  ## Reads a page of int64 microsecond counts from `fh` (sized by `shape`) and
  ## wraps each one via `newPY_Object(seconds2Duration(...))`.
  ##
  ## Every element is tallied under `K_TIME` in the resulting `dtypes` table.
  ## `endianness` is part of the shared reader signature but is not consulted
  ## by this proc.
  let data = readPrimitiveBuffer[int64](fh, shape)
  let dtypes = {K_TIME: data.len}.toTable
  let buf = collect:
    for v in data:
      # The seconds reader hands the raw count straight to `seconds2Duration`,
      # so that helper works in seconds. A microsecond count therefore has to
      # be scaled DOWN by 1_000_000; the earlier code multiplied instead,
      # which inflated every duration by a factor of 10^12.
      newPY_Object(seconds2Duration(float(v) / 1_000_000.0))

  return ObjectNDArray(buf: buf, shape: shape, dtypes: dtypes)
694+
662695
template newFloatNDArray(fh: var File, endianness: Endianness, size: int, shape: var Shape) =
663696
case size:
664697
of 4: Float32NDArray(buf: readPrimitiveBuffer[float32](fh, shape), shape: shape)
@@ -711,20 +744,22 @@ proc readPageInfo(fh: var File): (NDArrayDescriptor, bool, Shape) =
711744

712745
proc readNumpy(fh: var File): BaseNDArray =
713746
var ((descrEndianness, descrType, descrSize), _, shape) = readPageInfo(fh)
714-
var page: BaseNDArray
715-
716-
case descrType:
717-
of D_BOOLEAN: page = newBooleanNDArray(fh, shape)
718-
of D_INT: page = newIntNDArray(fh, descrEndianness, descrSize, shape)
719-
of D_FLOAT: page = newFloatNDArray(fh, descrEndianness, descrSize, shape)
720-
of D_UNICODE: page = newUnicodeNDArray(fh, descrEndianness, descrSize, shape)
721-
of D_OBJECT: page = newObjectNDArray(fh, descrEndianness, shape)
722-
of D_DATE_DAYS: page = newDateArray_Days(fh, descrEndianness, shape)
723-
of D_DATETIME_SECONDS: page = newDateTimeArray_Seconds(fh, descrEndianness, shape)
724-
of D_DATETIME_MILISECONDS: page = newDateTimeArray_Miliseconds(fh, descrEndianness, shape)
725-
of D_DATETIME_MICROSECONDS: page = newDateTimeArray_Microseconds(fh, descrEndianness, shape)
726-
else: implement($descrType)
727747

748+
let page = (
749+
case descrType:
750+
of D_BOOLEAN: newBooleanNDArray(fh, shape)
751+
of D_INT: newIntNDArray(fh, descrEndianness, descrSize, shape)
752+
of D_FLOAT: newFloatNDArray(fh, descrEndianness, descrSize, shape)
753+
of D_UNICODE: newUnicodeNDArray(fh, descrEndianness, descrSize, shape)
754+
of D_OBJECT: newObjectNDArray(fh, descrEndianness, shape)
755+
of D_DATE_DAYS: newDateArray_Days(fh, descrEndianness, shape)
756+
of D_DATETIME_SECONDS: newDateTimeArray_Seconds(fh, descrEndianness, shape)
757+
of D_DATETIME_MILISECONDS: newDateTimeArray_Miliseconds(fh, descrEndianness, shape)
758+
of D_DATETIME_MICROSECONDS: newDateTimeArray_Microseconds(fh, descrEndianness, shape)
759+
of D_TIME_SECONDS: newTimeArray_Seconds(fh, descrEndianness, shape)
760+
of D_TIME_MILISECONDS: newTimeArray_Miliseconds(fh, descrEndianness, shape)
761+
of D_TIME_MICROSECONDS: newTimeArray_Microseconds(fh, descrEndianness, shape)
762+
)
728763
return page
729764

730765
proc readNumpy*(path: string): BaseNDArray =

tablite/redux.py

Lines changed: 178 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from tablite.base import BaseTable
22
import numpy as np
3+
import warnings
34
from tablite.utils import sub_cls_check, type_check, expression_interpreter
45
from tablite.mp_utils import filter_ops
56
from tablite.datatypes import list_to_np_array
67
from tablite.config import Config
7-
from tablite.nimlite import filter as _filter_using_list_of_dicts
8+
from tablite.nimlite import filter as _filter_using_list_of_dicts_native
89
from tqdm import tqdm as _tqdm
910

1011

@@ -163,10 +164,184 @@ def _compress_both(T, mask, pbar: _tqdm):
163164
pbar.update(pbar_step)
164165
return true, false
165166

167+
def _filter_using_list_of_dicts(T, expressions, filter_type, pbar: _tqdm):
    """
    Builds a boolean row mask by evaluating a list of criteria across columns.

    Args:
        T: table to filter. It is used through ``len(T)``, ``T.columns`` and
            ``T[name][start:end]``.
        expressions (list of dicts): one criterion per dict, for example:

            L = [
                {'column1': 'A', 'criteria': "==", 'column2': 'B'},
                {'column1': 'C', 'criteria': "!=", 'value2': '4'},
                {'value1': 200, 'criteria': "<", 'column2': 'D'},
            ]

            accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'
            Each dict must hold exactly 3 items and its 'criteria' must be a
            key of ``filter_ops``.
        filter_type (str): 'all' (every criterion must hold for a row) or
            'any' (at least one criterion must hold).
        pbar: progress bar; it is advanced as pages are evaluated and finally
            moved to 10.

    Returns:
        numpy bool array with one entry per row of T.

    Raises:
        TypeError: an expression is not a dict, or filter_type is not a str.
        ValueError: malformed expression (wrong item count, unknown key,
            missing/unknown criteria, unknown column, both column and value
            given for one side), unknown filter_type, or operands of
            different length.
    """
    import operator  # local import: only the vectorised comparisons below need it

    accepted_keys = {"column1", "column2", "criteria", "value1", "value2"}

    for expression in expressions:
        if not isinstance(expression, dict):
            raise TypeError(f"invalid expression: {expression}")
        if not len(expression) == 3:
            raise ValueError(f"expected 3 items, got {expression}")
        if not set(expression.keys()).issubset(accepted_keys):
            raise ValueError(f"got unknown key: {set(expression.keys()).difference(accepted_keys)}")

        # .get() so that an absent 'criteria' key produces the ValueError below
        # rather than an unrelated KeyError from a direct subscript.
        if expression.get("criteria", None) not in filter_ops:
            raise ValueError(f"criteria missing from {expression}")

        c1 = expression.get("column1", None)
        if c1 is not None and c1 not in T.columns:
            raise ValueError(f"no such column: {c1}")

        v1 = expression.get("value1", None)
        if v1 is not None and c1 is not None:
            raise ValueError("filter can only take 1 left expr element. Got 2.")

        c2 = expression.get("column2", None)
        if c2 is not None and c2 not in T.columns:
            raise ValueError(f"no such column: {c2}")

        v2 = expression.get("value2", None)
        if v2 is not None and c2 is not None:
            raise ValueError("filter can only take 1 right expression element. Got 2.")

    if not isinstance(filter_type, str):
        raise TypeError(f"filter_type must be str, got {type(filter_type)}")
    if filter_type not in {"all", "any"}:
        raise ValueError(f"filter_type: {filter_type} not in ['all', 'any']")

    # Comparisons that are applied to whole pages at once; any other criteria
    # is evaluated element by element through filter_ops.
    vectorised = {
        ">": operator.gt,
        ">=": operator.ge,
        "==": operator.eq,
        "<": operator.lt,
        "<=": operator.le,
        "!=": operator.ne,
    }

    def safe_test(f, a, b):
        # Element-wise fallback: a pair that cannot be compared counts as False.
        try:
            return f(a, b)
        except TypeError:
            return False

    # EVALUATION....
    # 1. setup a rectangular bitmap for evaluations: one row per expression,
    #    one column per table row.
    pages = list(Config.page_steps(len(T)))  # page boundaries are the same for every expression
    bitmap = np.empty(shape=(len(expressions), len(T)), dtype=bool)
    pbar_div = len(expressions) * len(pages) - 1
    pbar_step = (10 / pbar_div) if pbar_div != 0 else 0

    # 2. evaluate every expression page by page
    for bit_index, expression in enumerate(expressions):
        c1 = expression.get("column1", None)
        c2 = expression.get("column2", None)
        expr = expression.get("criteria", None)
        v1 = expression.get("value1", None)
        v2 = expression.get("value2", None)

        compare = vectorised.get(expr)
        f = filter_ops.get(expr)

        for start, end in pages:
            if c1 is not None:
                dset_A = T[c1][start:end]
            else:  # v1 is active: broadcast the constant to the page length
                dset_A = np.array([v1] * (end - start))

            if c2 is not None:
                dset_B = T[c2][start:end]
            else:  # v2 is active: broadcast the constant to the page length
                dset_B = np.array([v2] * (end - start))

            if len(dset_A) != len(dset_B):
                raise ValueError(
                    f"Assymmetric dataset: {c1} has {len(dset_A)} values, whilst {c2} has {len(dset_B)} values."
                )

            # Evaluate
            try:
                if compare is not None:
                    result = compare(dset_A, dset_B)
                else:  # it's a python evaluation (slow)
                    assert callable(f)
                    result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)])
            except TypeError:
                # The page holds values that cannot be compared wholesale;
                # retry element by element, treating failures as False.
                assert callable(f)
                result = list_to_np_array([safe_test(f, a, b) for a, b in zip(dset_A, dset_B)])

            bitmap[bit_index, start:end] = result
            pbar.update(pbar_step)

    # 3. collapse the per-expression rows into a single mask
    reducer = np.all if filter_type == "all" else np.any
    mask = reducer(bitmap, axis=0)
    # 4. The mask is now created and the bitmap is no longer needed.
    pbar.update(10 - pbar.n)
    return mask
287+
288+
def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
    """
    OBSOLETE
    Splits a table into the rows that satisfy a filter and the rows that do not.

    A warning is emitted on every call, because filtering on non-primitive
    types is not recommended.

    Args:
        T (Table subclass): Table.
        expressions (list or str):
            str:
                filters based on an expression, such as:
                    "all((A==B, C!=4, 200<D))"
                which is interpreted using python's compiler to:

                    def _f(A,B,C,D):
                        return all((A==B, C!=4, 200<D))

            list of dicts: (example):

                L = [
                    {'column1': 'A', 'criteria': "==", 'column2': 'B'},
                    {'column1': 'C', 'criteria': "!=", 'value2': '4'},
                    {'value1': 200, 'criteria': "<", 'column2': 'D'},
                ]

            accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'

        filter_type (str, optional): Ignored if expressions is str.
            'all' or 'any'. Defaults to "all".
        tqdm (tqdm, optional): progressbar. Defaults to _tqdm.

    Returns:
        2xTables: trues, falses

    Raises:
        TypeError: expressions is neither a str nor a list.
    """
    warnings.warn("Filter using non-primitive types is not recommended.")
    sub_cls_check(T, BaseTable)

    # Nothing to filter: hand back two independent copies of the empty table.
    if not len(T):
        trues, falses = T.copy(), T.copy()
        return trues, falses

    with tqdm(desc="filter", total=20) as progress:
        # The first half of the bar (10 of 20) covers building the mask.
        if isinstance(expressions, list):
            row_mask = _filter_using_list_of_dicts(T, expressions, filter_type, progress)
        elif isinstance(expressions, str):
            row_mask = _filter_using_expression(T, expressions)
            progress.update(10)
        else:
            raise TypeError

        # The second half covers materialising the two result tables.
        split = _compress_both(T, row_mask, pbar=progress)
        progress.update(progress.total - progress.n)

    return split
166341

167342
def filter(T, expressions, filter_type="all", tqdm=_tqdm):
168343
"""filters table
169-
344+
Note: At the moment only tablite primitive types are supported
170345
171346
Args:
172347
T (Table subclass): Table.
@@ -209,7 +384,7 @@ def _f(A,B,C,D):
209384
res = _compress_both(T, mask, pbar=pbar)
210385
pbar.update(pbar.total - pbar.n)
211386
elif isinstance(expressions, list):
212-
return _filter_using_list_of_dicts(T, expressions, filter_type, tqdm)
387+
return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)
213388
else:
214389
raise TypeError
215390
# create new tables

tablite/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
major, minor, patch = 2023, 11, 0
1+
major, minor, patch = 2023, 11, 1
22
__version_info__ = (major, minor, patch)
33
__version__ = ".".join(str(i) for i in __version_info__)

0 commit comments

Comments
 (0)