|
1 | 1 | from tablite.base import BaseTable |
2 | 2 | import numpy as np |
| 3 | +import warnings |
3 | 4 | from tablite.utils import sub_cls_check, type_check, expression_interpreter |
4 | 5 | from tablite.mp_utils import filter_ops |
5 | 6 | from tablite.datatypes import list_to_np_array |
6 | 7 | from tablite.config import Config |
7 | | -from tablite.nimlite import filter as _filter_using_list_of_dicts |
| 8 | +from tablite.nimlite import filter as _filter_using_list_of_dicts_native |
8 | 9 | from tqdm import tqdm as _tqdm |
9 | 10 |
|
10 | 11 |
|
@@ -163,10 +164,184 @@ def _compress_both(T, mask, pbar: _tqdm): |
163 | 164 | pbar.update(pbar_step) |
164 | 165 | return true, false |
165 | 166 |
|
def _filter_using_list_of_dicts(T, expressions, filter_type, pbar: _tqdm):
    """
    Build a boolean row mask for T from a list of comparison expressions.

    Every expression is evaluated page by page (pages come from
    `Config.page_steps`) into one row of a bitmap; the rows are then
    combined column-wise with `np.all` or `np.any`.

    Args:
        T: table to filter. It is indexed as `T[column_name][start:end]`
            and must support `len(T)` and `T.columns`.
        expressions (list of dicts): each dict has exactly 3 items: the
            key 'criteria' plus one left operand ('column1' or 'value1')
            and one right operand ('column2' or 'value2'). Example:

            L = [
                {'column1': 'A', 'criteria': "==", 'column2': 'B'},
                {'column1': 'C', 'criteria': "!=", 'value2': '4'},
                {'value1': 200, 'criteria': "<", 'column2': 'D'},
            ]

            accepted dictionary keys:
            'column1', 'column2', 'criteria', 'value1', 'value2'
        filter_type (str): 'all' or 'any'.
        pbar: progress bar; this function advances it up to 10 units.

    Returns:
        1-D boolean array with one entry per row of T.

    Raises:
        TypeError: if an expression is not a dict or filter_type is not a str.
        ValueError: if an expression does not have exactly 3 items, has an
            unknown key, has a missing/unknown criteria, names a column
            that is not in T, supplies both a column and a value on the
            same side, or if filter_type is not 'all' or 'any'.
    """
    allowed_keys = {"column1", "column2", "criteria", "value1", "value2"}

    for expression in expressions:
        if not isinstance(expression, dict):
            raise TypeError(f"invalid expression: {expression}")
        if len(expression) != 3:
            raise ValueError(f"expected 3 items, got {expression}")
        unknown_keys = set(expression).difference(allowed_keys)
        if unknown_keys:
            raise ValueError(f"got unknown key: {unknown_keys}")

        # .get() rather than [...]: a dict without a 'criteria' key must
        # produce the ValueError below, not a bare KeyError.
        if expression.get("criteria") not in filter_ops:
            raise ValueError(f"criteria missing from {expression}")

        c1 = expression.get("column1")
        if c1 is not None and c1 not in T.columns:
            raise ValueError(f"no such column: {c1}")

        v1 = expression.get("value1")
        if v1 is not None and c1 is not None:
            raise ValueError("filter can only take 1 left expr element. Got 2.")

        c2 = expression.get("column2")
        if c2 is not None and c2 not in T.columns:
            raise ValueError(f"no such column: {c2}")

        v2 = expression.get("value2")
        if v2 is not None and c2 is not None:
            raise ValueError("filter can only take 1 right expression element. Got 2.")

    if not isinstance(filter_type, str):
        raise TypeError(f"filter_type must be str, got {type(filter_type).__name__}")
    if filter_type not in {"all", "any"}:
        raise ValueError(f"filter_type: {filter_type} not in ['all', 'any']")

    def safe_test(f, a, b):
        # Element-wise fallback: operands that cannot be compared count as
        # a non-match instead of aborting the whole filter.
        try:
            return f(a, b)
        except TypeError:
            return False

    # EVALUATION....
    # 1. setup a rectangular bitmap for evaluations:
    #    one row per expression, one column per table row.
    pages = list(Config.page_steps(len(T)))
    bitmap = np.empty(shape=(len(expressions), len(T)), dtype=bool)
    pbar_div = len(expressions) * len(pages) - 1
    pbar_step = (10 / pbar_div) if pbar_div != 0 else 0

    # 2. evaluate each expression page by page.
    for bit_index, expression in enumerate(expressions):
        c1 = expression.get("column1")
        c2 = expression.get("column2")
        expr = expression.get("criteria")
        v1 = expression.get("value1")
        v2 = expression.get("value2")

        for start, end in pages:
            if c1 is not None:
                dset_A = T[c1][start:end]
            else:  # v1 is active: repeat the constant to the page length.
                dset_A = np.array([v1] * (end - start))

            if c2 is not None:
                dset_B = T[c2][start:end]
            else:  # v2 is active: repeat the constant to the page length.
                dset_B = np.array([v2] * (end - start))

            if len(dset_A) != len(dset_B):
                raise ValueError(
                    f"Assymmetric dataset: {c1} has {len(dset_A)} values, whilst {c2} has {len(dset_B)} values."
                )

            # Evaluate: vectorised comparison first; on TypeError fall
            # back to a tolerant element-by-element test.
            try:
                if expr == ">":
                    result = dset_A > dset_B
                elif expr == ">=":
                    result = dset_A >= dset_B
                elif expr == "==":
                    result = dset_A == dset_B
                elif expr == "<":
                    result = dset_A < dset_B
                elif expr == "<=":
                    result = dset_A <= dset_B
                elif expr == "!=":
                    result = dset_A != dset_B
                else:  # it's a python evaluation (slow)
                    f = filter_ops.get(expr)
                    result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)])
            except TypeError:
                f = filter_ops.get(expr)
                result = list_to_np_array([safe_test(f, a, b) for a, b in zip(dset_A, dset_B)])

            bitmap[bit_index, start:end] = result
            pbar.update(pbar_step)

    # 3. collapse the bitmap across expressions into a single row mask.
    reduce_rows = np.all if filter_type == "all" else np.any
    mask = reduce_rows(bitmap, axis=0)
    # 4. The mask is now created and the bitmap is no longer needed.
    pbar.update(10 - pbar.n)
    return mask
| 287 | + |
def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
    """
    OBSOLETE
    filters table

    Emits a warning on every call, builds a boolean mask from
    `expressions` and splits T into the rows that match and the rows
    that do not.

    Args:
        T (Table subclass): Table.
        expressions (list or str):
            str:
                filters based on an expression, such as:
                "all((A==B, C!=4, 200<D))"
                which is handed to `_filter_using_expression`.

            list of dicts: (example):

            L = [
                {'column1': 'A', 'criteria': "==", 'column2': 'B'},
                {'column1': 'C', 'criteria': "!=", 'value2': '4'},
                {'value1': 200, 'criteria': "<", 'column2': 'D'},
            ]

            accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'

        filter_type (str, optional): Ignored if expressions is str.
            'all' or 'any'. Defaults to "all".
        tqdm (tqdm, optional): progressbar. Defaults to _tqdm.

    Returns:
        2xTables: trues, falses. For an empty table, two copies of T.

    Raises:
        TypeError: if expressions is neither a str nor a list.
    """
    # stacklevel=2 attributes the warning to the caller of this function,
    # which is the code that needs to change.
    warnings.warn("Filter using non-primitive types is not recommended.", stacklevel=2)
    sub_cls_check(T, BaseTable)
    if len(T) == 0:
        return T.copy(), T.copy()

    # The bar has 20 units: the first 10 cover building the mask,
    # the remainder covers creating the two result tables.
    with tqdm(desc="filter", total=20) as pbar:
        # determine method
        if isinstance(expressions, str):
            mask = _filter_using_expression(T, expressions)
            pbar.update(10)
        elif isinstance(expressions, list):
            mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)
        else:
            raise TypeError(
                f"expressions must be str or list of dicts, got {type(expressions).__name__}"
            )
        # create new tables
        res = _compress_both(T, mask, pbar=pbar)
        # top the bar up to its total in case earlier steps fell short.
        pbar.update(pbar.total - pbar.n)

    return res
166 | 341 |
|
167 | 342 | def filter(T, expressions, filter_type="all", tqdm=_tqdm): |
168 | 343 | """filters table |
169 | | -
|
| 344 | + Note: At the moment only tablite primitive types are supported |
170 | 345 |
|
171 | 346 | Args: |
172 | 347 | T (Table subclass): Table. |
@@ -209,7 +384,7 @@ def _f(A,B,C,D): |
209 | 384 | res = _compress_both(T, mask, pbar=pbar) |
210 | 385 | pbar.update(pbar.total - pbar.n) |
211 | 386 | elif isinstance(expressions, list): |
212 | | - return _filter_using_list_of_dicts(T, expressions, filter_type, tqdm) |
| 387 | + return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm) |
213 | 388 | else: |
214 | 389 | raise TypeError |
215 | 390 | # create new tables |
|
0 commit comments