diff --git a/.gitignore b/.gitignore index 9daa824..e64a58a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,12 @@ .DS_Store + +# Python +__pycache__ +python/build +python/dist +*.egg-info +*.pyc +*.pyo + +# TypeScript node_modules diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..ae553e3 --- /dev/null +++ b/python/README.md @@ -0,0 +1,66 @@ +# The Tablo File Format + +__tablo__ is a plain text interchange format for tabular data. It is more +expressive than CSV while remaining easy for people to read and write. + +It adds explicit headers, datatypes, and cell formatting to address +shortcomings of delimiter-separated formats like CSV or TSV. __tablo__ solves +delimiter collision issues by having well-defined quoting and escaping rules +that are familiar to users of formats like JSON or common programming languages +like Python or JavaScript. + +## What's Wrong with CSV? + +> the Microsoft version of CSV is a textbook example of how *not* to design a +> textual file format + +—Eric S. Raymond, [*The Art of Unix Programming*][taoup] + +Stated simply, there is no single CSV standard. It exists as a myriad of +informal variants whose implementation varies from vendor to vendor. Character +encodings and escape sequences vary from one application to the next, and the +ambiguities in various edge cases mean that the output of one application may +not be readable by another. + +__tablo__ is designed to solve a number of ambiguities and shortcomings in CSV. + +One of the first obvious differences is that header rows are optional, but +well-defined. In other words, a document may or may not contain a header, but +determining whether the document includes a header is always unambiguous. + +A crucial aspect of the __tablo__ format is that it doesn't make assumptions +about the type of data in each cell. If a value is surrounded by quotes, it is +*always* a string. If a value is a number without quotes, it is *always* a +number. 
If a value is an ISO-8601 formatted date preceded by a hash mark, it +is *always* a datetime. + +## Installation + +Install with `pip`: + +``` +python -m pip install tablo-fyi +``` + +## Usage + +Parsing is accomplished with the `parse` function. + +``` +import tablo + +data = tablo.parse('"name", "age"\n=0.1\n"Tom", 24\n"Jerry", 27\n') + +name = data['A0'] # Retrieves the value in column A, row 0 => 'Tom' +age = data['B1'] # Retrieves the value in column B, row 1 => 27 +``` + +## More Information + +More information can be found in [the __tablo__ specification][spec], and a +set of [example files][examples] can be found in the [project repository][repo]. + +[taoup]: http://www.catb.org/esr/writings/taoup/html/ch05s02.html#id2901882 +[spec]: https://tablo.fyi +[examples]: https://github.com/jotjotdotio/tablo/tree/main/examples +[repo]: https://github.com/jotjotdotio/tablo diff --git a/python/getting-started.md b/python/getting-started.md new file mode 100644 index 0000000..cd6aeec --- /dev/null +++ b/python/getting-started.md @@ -0,0 +1,21 @@ +# Development Guide + +## Installing + +1. Run `python -m pip install -r requirements.txt` to install all development dependencies + +## Testing + +1. Run `python -m unittest test` to run the test suite + +## Building + +1. Run `python setup.py sdist bdist_wheel` to build the source and binary +distributions + +## Distributing + +1. Run `twine upload -r testpypi dist/*` to upload build artifacts to the +PyPI test environment +2. Verify that everything looks right +3. 
from typing import Iterable


def concat(*rules):
    """Build a parser that applies every rule in sequence.

    Each rule is a callable ``rule(input, offset)`` returning a triple
    ``(new_offset, match, error)``; a truthy ``error`` signals failure.

    The combined parser returns ``(end_offset, matches, None)`` when every
    rule succeeds. A match that is a non-string iterable (for example the
    list produced by a nested ``concat`` or ``repeat``) is flattened one
    level into ``matches``; any other match is appended as a single item.
    When a rule fails the parser returns ``(offset, None, error)`` with the
    offset it was originally called with.
    """
    def combinator(input: str, offset: int):
        cursor = offset
        results = []

        for rule in rules:
            position, match, error = rule(input, cursor)

            if error:
                return (offset, None, error)

            cursor = position

            if isinstance(match, Iterable) and not isinstance(match, str):
                results.extend(match)
            else:
                results.append(match)

        return (cursor, results, None)

    return combinator


def altern(*rules):
    """Build a parser that returns the result of the first rule that succeeds.

    Every alternative is tried from the same starting offset. When all of
    them fail the parser returns ``(offset, None, message)`` where
    ``message`` is ``'one of '`` followed by the comma-joined error of each
    alternative.
    """
    def combinator(input: str, offset: int):
        errors = []

        for rule in rules:
            # Always restart from `offset`: a failed alternative must not
            # influence where the next one begins.
            position, result, error = rule(input, offset)

            if not error:
                return (position, result, None)

            # `append`, not `extend`: errors are strings, and extending a
            # list with a string would add it one character at a time.
            errors.append(str(error))

        return (offset, None, f'one of {",".join(errors)}')

    return combinator


def repeat(*rules):
    """Build a parser that matches the rule sequence zero or more times.

    The first rule decides whether another repetition starts: when it fails,
    repetition stops and everything collected so far is returned as
    ``(end_offset, results, None)``. Once the first rule has matched, the
    remaining rules must all match too, otherwise the whole parser fails with
    ``(offset, None, error)``.

    The first rule's match is appended as one item; the matches of the
    remaining rules are combined with ``concat`` and therefore flattened.

    Raises:
        ValueError: if called without any rule.
    """
    if not rules:
        raise ValueError('repeat() requires at least one rule')

    first, *rest = rules
    tail = concat(*rest)

    def combinator(input: str, offset: int):
        cursor = offset
        results = []

        while True:
            position, result, error = first(input, cursor)

            if error:
                # Not an error for the repetition as a whole; keep `cursor`
                # at the end of the last complete repetition.
                break

            results.append(result)
            position, matched, error = tail(input, position)

            if error:
                return (offset, None, error)

            results.extend(matched)

            if position == cursor:
                # A repetition that consumed nothing would succeed forever.
                break

            cursor = position

        return (cursor, results, None)

    return combinator
def get_rules(self):
    """Return the stored rules as a ``{range_key: [props, ...]}`` dict.

    ``self.rules`` holds ``(bounds, key, props)`` tuples. Entries that share
    a key are merged in order. Every list in the result is a fresh copy, so
    neither merging nor later changes made by the caller alter the property
    lists stored on the instance.
    """
    merged = {}

    for _bounds, key, props in self.rules:
        # setdefault + extend copies the items instead of aliasing `props`.
        merged.setdefault(key, []).extend(props)

    return merged
def header(input: str, offset: int):
    """Parse the optional header row followed by the version line.

    The header row is a comma-separated list of labels ending in a newline.
    It is optional: when no label is found at ``offset`` the header is the
    empty list. The version line (``=`` followed by a version number and a
    newline) is mandatory, and ``0.1`` is the only accepted version.

    Returns:
        ``(end_offset, labels, None)`` on success, or
        ``(offset, None, error)`` with the original offset on failure.
    """
    def headerLine(input: str, offset: int):
        position, first, error = label(input, offset)

        if error:
            # No label here means the document has no header row.
            return (offset, [], None)

        position, matched, error = concat(
            repeat(comma, label), newline
        )(input, position)

        if error:
            return (offset, None, error)

        # Build a real list: adding a list to a lazy `filter` object raises
        # TypeError.
        labels = [
            elt for elt in matched
            if elt is not Token.Comma and elt is not Token.Newline
        ]
        return (position, [first] + labels, None)

    start = offset
    offset, labels, error = headerLine(input, start)

    if error:
        return (start, None, error)

    offset, verNumber, error = concat(equals, version, newline)(input, offset)

    if error:
        return (start, None, error)

    # verNumber is [Token.Equals, <version string>, Token.Newline].
    if verNumber[1] != '0.1':
        return (start, None, 'invalid version number')

    return (offset, labels, None)
def star(input: str, offset: int):
    """Match the ``*`` that introduces the format section.

    Returns ``(end_offset, Token.Star, None)`` when a ``*`` is found at
    ``offset``, otherwise ``(offset, None, 'format separator')``.
    """
    if match := pattern['star'].match(input, offset):
        # Token.Star is the token for '*'; Token.CloseBrace stands for '}'.
        return (match.end(), Token.Star, None)

    return (offset, None, 'format separator')
# An escape sequence inside a quoted string: a backslash followed by one of
# `" n t f r b \`, or `\u{...}` holding one to eight hexadecimal digits.
_ESCAPE_SEQUENCE = re.compile(r'\\["ntfrb\\]|\\u\{([0-9A-Fa-f]{1,8})\}')

# Replacement text for each single-character escape sequence.
_SIMPLE_ESCAPES = {
    r'\"': '\"',
    r'\n': '\n',
    r'\t': '\t',
    r'\f': '\f',
    r'\r': '\r',
    r'\b': '\b',
    r'\\': '\\',
}


def stringValue(input: str, offset: int):
    """Parse a double-quoted string and decode its escape sequences.

    Returns ``(end_offset, text, None)`` on success. Returns
    ``(offset, None, 'string')`` when no quoted string starts at ``offset``
    or when a ``\\u{...}`` escape names a value that is not a valid code
    point. Backslash sequences that ``_ESCAPE_SEQUENCE`` does not match are
    left in the text unchanged.
    """
    match = pattern['string'].match(input, offset)

    if not match:
        return (offset, None, 'string')

    def replace(escape):
        # Group 1 is only set for the `\u{...}` form. Test it against None:
        # `groups()` is a non-empty tuple, hence truthy, for every match.
        codepoint = escape.group(1)

        if codepoint is not None:
            return chr(int(codepoint, 16))

        # Look up the matched escape text, not the whole subject string.
        return _SIMPLE_ESCAPES[escape.group(0)]

    try:
        value = _ESCAPE_SEQUENCE.sub(replace, match.group(1))
    except (ValueError, OverflowError):
        # chr() rejects values above 0x10FFFF.
        return (offset, None, 'string')

    return (match.end(), value, None)
class TabloSerializer(Serializer):
    """Serialize a table to tablo text.

    The output is the optional header row, the ``=0.1`` version line, one
    line per data row and a ``*`` format section.

    NOTE: ``table.breaks`` is not read here, so section breaks are not
    written out.
    """

    # Characters that must be written as escape sequences inside a quoted
    # string. The backslash is included so existing backslashes stay literal.
    _ESCAPES = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\f': '\\f',
        '\b': '\\b',
    })

    @classmethod
    def serialize(cls, table):
        """Return the complete tablo document for ``table``."""
        header = cls.serialize_header(table)
        data = cls.serialize_data(table)
        format = cls.serialize_format(table.format)

        # Only terminate the data section when there are rows; an empty
        # table would otherwise get a blank line before the '*'.
        rows = f'{data}\n' if table.data else ''

        return f'{header}=0.1\n{rows}{format}'

    @classmethod
    def serialize_header(cls, table: Table):
        """Return the header row with a trailing newline, or '' without one."""
        if not table.header:
            return ''

        return ','.join(cls.serialize_item(val) for val in table.header) + '\n'

    @classmethod
    def serialize_data(cls, table: Table):
        """Return the data rows joined by newlines, without a trailing one."""
        return '\n'.join(
            ','.join(cls.serialize_item(elt) for elt in row)
            for row in table.data
        )

    @classmethod
    def serialize_item(cls, item):
        """Return the tablo literal for a single cell value.

        Raises:
            TypeError: if ``item`` is not ``None``, ``bool``, ``str``,
                ``int`` or ``float``.
        """
        if item is None:
            return '-'

        # bool must be tested before int: bool is a subclass of int, so
        # True would otherwise be written as 'True'.
        if isinstance(item, bool):
            return 'true' if item else 'false'

        if isinstance(item, str):
            return f'"{item.translate(cls._ESCAPES)}"'

        if isinstance(item, (int, float)):
            return str(item)

        raise TypeError(
            f'cannot serialize value of type {type(item).__name__}'
        )

    @classmethod
    def serialize_format(cls, format: TableFormat):
        """Return the ``*`` format section, one rule per line.

        Every rule line ends with a newline. ``None`` is treated as a format
        without rules.
        """
        rules = format.get_rules() if format is not None else {}

        return '*\n' + ''.join(
            f'{key} {{{",".join(props)}}}\n' for key, props in rules.items()
        )
class Table(object):
    """A table: an optional header, a list of data rows and a format.

    Cells can be read with ``get(column, row)`` or by reference through
    ``table[...]``.
    """

    # A cell reference: optional column letters followed by an optional row
    # number. It is applied with fullmatch so trailing text is rejected.
    _REFERENCE = re.compile(r'([A-Z]+)?([0-9]+)?')

    def __init__(self, header, rows, format):
        self.header = header
        self.data = rows
        self.format = format
        # Starts empty; nothing in this class reads or changes it.
        self.breaks = []

    def concat(self, rows):
        """Append ``rows`` to the table's data in place."""
        self.data += rows

    def get(self, column, row):
        """Return the cell at ``column`` (letters or an integer) and ``row``."""
        if isinstance(column, str):
            column = self._alpha_to_int(column)

        return self.data[row][column]

    def get_row(self, row):
        """Return the data row at index ``row``."""
        return self.data[row]

    def __getitem__(self, name: str) -> Any:
        """Look up a cell, a row or a column by reference.

        ``'B1'`` returns the cell in column B of row 1, ``'1'`` returns row 1
        and ``'B'`` returns an iterator over column B of every row.

        Raises:
            KeyError: if ``name`` is not a valid reference or points outside
                the data.
        """
        match = self._REFERENCE.fullmatch(name)

        if not match:
            raise KeyError(name)

        col_str, row_str = match.groups()
        col = self._alpha_to_int(col_str) if col_str else None
        row = int(row_str) if row_str else None

        if row is None and col is None:
            raise KeyError(name)

        try:
            if col is None:
                return self.data[row]

            if row is None:
                # Build the column eagerly so a short row raises here,
                # inside the try, and not later while the caller iterates.
                return iter([cells[col] for cells in self.data])

            return self.data[row][col]
        except IndexError:
            raise KeyError(name) from None

    def _alpha_to_int(self, index: str):
        """Convert column letters to a zero-based index.

        The letters are read as base-26 digits with ``A`` as 0, so ``'B'`` is
        1 and ``'BA'`` is 26. ``'AA'`` evaluates to 0, the same as ``'A'``.
        """
        alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        value = 0

        for idx, char in enumerate(reversed(index)):
            value += alphabet.index(char) * 26 ** idx

        return value