Skip to content

Commit

Permalink
Added an ability to check tabular resource's integrity (#249)
Browse files Browse the repository at this point in the history
* Added IntegrityError

* Bump tabulator/tableschema

* Implemented integrity checks

* Added comments

* Fixed memory consumption by the check functions

* Updated comment
  • Loading branch information
roll authored Oct 31, 2019
1 parent 3c246c6 commit 7316088
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 13 deletions.
23 changes: 19 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ For tabular resources it returns `Schema` instance to interact with data schema.

- `(tableschema.Schema)` - returns schema class instance

#### `resource.iter(keyed=False, extended=False, cast=True, relations=False)`
#### `resource.iter(keyed=False, extended=False, cast=True, integrity=False, relations=False)`

> Only for tabular resources
Expand All @@ -497,14 +497,15 @@ Iter through the table data and emits rows cast based on table schema (async for
- `keyed (bool)` - iter keyed rows
- `extended (bool)` - iter extended rows
- `cast (bool)` - disable data casting if false
- `integrity (bool)` - if true actual size in BYTES and SHA256 hash of the file will be checked against `descriptor.bytes` and `descriptor.hash` (other hashing algorithms are not supported and will be skipped silently)
- `relations (bool)` - if true foreign key fields will be checked and resolved to its references
- `(exceptions.DataPackageException)` - raises any error that occurred in this process
- `(any[]/any{})` - yields rows:
- `[value1, value2]` - base
- `{header1: value1, header2: value2}` - keyed
- `[rowNumber, [header1, header2], [value1, value2]]` - extended

#### `resource.read(keyed=False, extended=False, cast=True, relations=False, limit=None)`
#### `resource.read(keyed=False, extended=False, cast=True, integrity=False, relations=False, limit=None)`

> Only for tabular resources
Expand All @@ -513,18 +514,28 @@ Read the whole table and returns as array of rows. Count of rows could be limite
- `keyed (bool)` - flag to emit keyed rows
- `extended (bool)` - flag to emit extended rows
- `cast (bool)` - flag to disable data casting if false
- `integrity (bool)` - if true actual size in BYTES and SHA256 hash of the file will be checked against `descriptor.bytes` and `descriptor.hash` (other hashing algorithms are not supported and will be skipped silently)
- `relations (bool)` - if true foreign key fields will be checked and resolved to its references
- `limit (int)` - integer limit of rows to return
- `(exceptions.DataPackageException)` - raises any error that occurred in this process
- `(list[])` - returns array of rows (see `resource.iter`)

#### `resource.check_integrity()`

> Only for tabular resources
It checks size in BYTES and SHA256 hash of the file against `descriptor.bytes` and `descriptor.hash` (other hashing algorithms are not supported and will be skipped silently).

- `(exceptions.IntegrityError)` - raises if there are integrity issues
- `(bool)` - returns True if no issues

#### `resource.check_relations()`

> Only for tabular resources
It checks foreign keys and raises an exception if there are relation issues.

- `(exceptions.RelationError)` - raises if there are integrity issues
- `(exceptions.RelationError)` - raises if there are relation issues
- `(bool)` - returns True if no issues

#### `resource.raw_iter(stream=False)`
Expand Down Expand Up @@ -908,10 +919,14 @@ All validation errors.

All value cast errors.

#### `exceptions.RelationError`
#### `exceptions.IntegrityError`

All integrity errors.

#### `exceptions.RelationError`

All relation errors.

#### `exceptions.StorageError`

All storage errors.
Expand Down
1 change: 1 addition & 0 deletions datapackage/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
LoadError = tableschema.exceptions.LoadError
ValidationError = tableschema.exceptions.ValidationError
CastError = tableschema.exceptions.CastError
IntegrityError = tableschema.exceptions.IntegrityError
RelationError = tableschema.exceptions.RelationError
StorageError = tableschema.exceptions.StorageError

Expand Down
9 changes: 9 additions & 0 deletions datapackage/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,12 @@ def is_safe_path(path):
]

return not any(unsafeness_conditions)


def extract_sha256_hash(hash):
    """Extract SHA256 hash or return None

    # Arguments
        hash (str): hash string expected to look like `sha256:<digest>`;
            may be `None` or empty

    # Returns
        str/None: the text following the leading `sha256:` prefix, or `None`
            when `hash` is empty or does not start with that prefix

    """
    prefix = 'sha256:'
    if hash and hash.startswith(prefix):
        # Slice the prefix off instead of using `str.replace`, which would
        # also strip any later occurrence of `sha256:` inside the value.
        return hash[len(prefix):]
    return None
39 changes: 32 additions & 7 deletions datapackage/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def schema(self):
return None
return self.__get_table().schema

def iter(self, relations=False, **options):
def iter(self, integrity=False, relations=False, **options):
"""https://github.com/frictionlessdata/datapackage-py#resource
"""

Expand All @@ -179,13 +179,18 @@ def iter(self, relations=False, **options):
message = 'Methods iter/read are not supported for non tabular data'
raise exceptions.DataPackageException(message)

# Get integrity
if integrity:
integrity = self.__get_integrity()

# Get relations
if relations:
relations = self.__get_relations()

return self.__get_table().iter(relations=relations, **options)
return self.__get_table().iter(
integrity=integrity, relations=relations, **options)

def read(self, relations=False, foreign_keys_values=False, **options):
def read(self, integrity=False, relations=False, foreign_keys_values=False, **options):
"""https://github.com/frictionlessdata/datapackage-py#resource
"""

Expand All @@ -194,17 +199,31 @@ def read(self, relations=False, foreign_keys_values=False, **options):
message = 'Methods iter/read are not supported for non tabular data'
raise exceptions.DataPackageException(message)

# Get integrity
if integrity:
integrity = self.__get_integrity()

# Get relations
if relations and not foreign_keys_values:
relations = self.__get_relations()

return self.__get_table().read(relations=relations, foreign_keys_values=foreign_keys_values,
**options)
return self.__get_table().read(
integrity=integrity, relations=relations,
foreign_keys_values=foreign_keys_values, **options)

def check_integrity(self):
    """https://github.com/frictionlessdata/datapackage-py#resource
    """
    # Drain the row iterator with integrity checking switched on and casting
    # switched off; the rows themselves are discarded, only completing the
    # iteration matters here.
    # This function will benefit from rebasing it on `resource.raw_iter`
    rows = self.iter(integrity=True, cast=False)
    for _ in rows:
        continue
    return True

def check_relations(self, foreign_keys_values=False):
"""https://github.com/frictionlessdata/datapackage-py#resource
"""
self.read(relations=True, foreign_keys_values=foreign_keys_values)
for row in self.iter(relations=True, foreign_keys_values=foreign_keys_values):
pass
return True

def drop_relations(self):
Expand Down Expand Up @@ -371,7 +390,7 @@ def __get_table(self):
source = self.source
if self.multipart:
source = _MultipartSource(self.source, remote=self.remote)
schema = self.descriptor.get('schema')
schema = self.__current_descriptor.get('schema')

# Storage resource
if self.__storage is not None:
Expand Down Expand Up @@ -402,6 +421,12 @@ def __get_table(self):

return self.__table

def __get_integrity(self):
    # Build the `integrity` option passed on to the table's iter/read:
    # the descriptor's `bytes` value as-is and its `hash` value with the
    # `sha256:` prefix handled by `helpers.extract_sha256_hash`.
    expected_size = self.__current_descriptor.get('bytes')
    declared_hash = self.__current_descriptor.get('hash')
    expected_hash = helpers.extract_sha256_hash(declared_hash)
    return {'size': expected_size, 'hash': expected_hash}

def __get_relations(self):
if not self.__relations:

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def read(*paths):
'jsonschema>=2.5',
'unicodecsv>=0.14',
'jsonpointer>=1.10',
'tableschema>=1.1.0',
'tabulator>=1.24.2',
'tableschema>=1.10',
'tabulator>=1.29',
]
TESTS_REQUIRE = [
'mock',
Expand Down
81 changes: 81 additions & 0 deletions tests/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
import pytest
import httpretty
from copy import deepcopy
from mock import Mock, ANY
from functools import partial
from tableschema import Storage
Expand Down Expand Up @@ -608,6 +609,86 @@ def test_save_data_to_storage():
storage.write.assert_called_with('data', ANY)


# Integrity

# Shared baseline descriptor for the integrity tests below. Each test takes a
# deep copy before changing `bytes`/`hash`, so this dict itself stays untouched.
# NOTE(review): `bytes` and `hash` are presumably the real size and SHA256
# digest of `data/data.csv` - confirm against the fixture file.
DESCRIPTOR = {
    'name': 'data.csv',
    'path': 'data/data.csv',
    'bytes': 63,
    'hash': 'sha256:328adab247692a1a405e83c2625d52e366389eabf8a1824931187877e8644774',
}

def test_read_integrity():
    # Untouched descriptor: reading with integrity checks on must not raise
    resource = Resource(deepcopy(DESCRIPTOR))
    resource.read(integrity=True)
    assert True


def test_read_integrity_error():
    # Break both the declared size and the declared hash
    broken = deepcopy(DESCRIPTOR)
    broken['bytes'] = broken['bytes'] + 1
    broken['hash'] = broken['hash'] + 'a'
    resource = Resource(broken)
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        resource.read(integrity=True)
    # The error text must mention the size and digest of the unmodified DESCRIPTOR
    message = str(excinfo.value)
    digest = DESCRIPTOR['hash'].replace('sha256:', '')
    assert str(DESCRIPTOR['bytes']) in message
    assert digest in message


def test_read_integrity_size():
    # Only the size is declared (hash set to None): reading must not raise
    size_only = deepcopy(DESCRIPTOR)
    size_only.update(hash=None)
    resource = Resource(size_only)
    resource.read(integrity=True)
    assert True


def test_read_integrity_size_error():
    # Wrong declared size, no declared hash
    broken = deepcopy(DESCRIPTOR)
    broken['bytes'] = broken['bytes'] + 1
    broken['hash'] = None
    resource = Resource(broken)
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        resource.read(integrity=True)
    # The error text must mention the size from the unmodified DESCRIPTOR
    message = str(excinfo.value)
    assert str(DESCRIPTOR['bytes']) in message


def test_read_integrity_hash():
    # Only the hash is declared (bytes set to None): reading must not raise
    hash_only = deepcopy(DESCRIPTOR)
    hash_only.update(bytes=None)
    resource = Resource(hash_only)
    resource.read(integrity=True)
    assert True


def test_read_integrity_hash_error():
    # No declared size, wrong declared hash
    broken = deepcopy(DESCRIPTOR)
    broken['bytes'] = None
    broken['hash'] = broken['hash'] + 'a'
    resource = Resource(broken)
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        resource.read(integrity=True)
    # The error text must mention the digest from the unmodified DESCRIPTOR
    digest = DESCRIPTOR['hash'].replace('sha256:', '')
    assert digest in str(excinfo.value)


def test_check_integrity():
    # Untouched descriptor: the dedicated check must return a truthy result
    resource = Resource(deepcopy(DESCRIPTOR))
    outcome = resource.check_integrity()
    assert outcome


def test_check_integrity_error():
    # Break both the declared size and the declared hash
    broken = deepcopy(DESCRIPTOR)
    broken['bytes'] = broken['bytes'] + 1
    broken['hash'] = broken['hash'] + 'a'
    resource = Resource(broken)
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        resource.check_integrity()
    # The error text must mention the size and digest of the unmodified DESCRIPTOR
    message = str(excinfo.value)
    digest = DESCRIPTOR['hash'].replace('sha256:', '')
    assert str(DESCRIPTOR['bytes']) in message
    assert digest in message


# Deprecated

def test_data():
Expand Down

0 comments on commit 7316088

Please sign in to comment.