From ee90c792c5be514b6e3413310e1e6bdee074a3b9 Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Sun, 11 Sep 2022 13:06:21 -0400 Subject: [PATCH 1/6] initial python wip --- python/src/__init__.py | 18 +++ python/src/combinators.py | 10 ++ python/src/format.py | 16 ++ python/src/parse.py | 302 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 346 insertions(+) create mode 100644 python/src/__init__.py create mode 100644 python/src/combinators.py create mode 100644 python/src/format.py create mode 100644 python/src/parse.py diff --git a/python/src/__init__.py b/python/src/__init__.py new file mode 100644 index 0000000..2febdff --- /dev/null +++ b/python/src/__init__.py @@ -0,0 +1,18 @@ + +import parse + +defaultSerializer = object() + +def parse(input: str): + _ignore, data, error = parse.document(input, 0) + + if error: + raise error + else: + return data + +def serialize(table): + return defaultSerializer.serialize(table) + +def setSerializer(serializer): + defaultSerializer = serializer() diff --git a/python/src/combinators.py b/python/src/combinators.py new file mode 100644 index 0000000..d935504 --- /dev/null +++ b/python/src/combinators.py @@ -0,0 +1,10 @@ + + +def concat(*funcs): + pass + +def altern(*funcs): + pass + +def repeat(*funcs): + pass diff --git a/python/src/format.py b/python/src/format.py new file mode 100644 index 0000000..d6751ea --- /dev/null +++ b/python/src/format.py @@ -0,0 +1,16 @@ +import re + + +class TableFormat(object): + rules = None + + def __init__(self, rules): + rule = re.compile(r'^([A-Z]+)(?::([A-Z]+))?$|^([0-9]+)(?::([0-9]+))?$|^([A-Z]+)([0-9]+)(?::([A-Z]+)([0-9]+))?$') + + self.rules = [] + + for key, props in rules: + if match := rule.match(key): + startRow, endRow, startCol, endCol = (0, -1, 0, -1) + + \ No newline at end of file diff --git a/python/src/parse.py b/python/src/parse.py new file mode 100644 index 0000000..1ecdc01 --- /dev/null +++ b/python/src/parse.py @@ -0,0 +1,302 @@ +import re +from enum import Enum + 
+from combinators import concat, altern, repeat +from format import TableFormat + + +pattern = { + 'string': re.compile(r'"((?:[^"\n\r\b\\]|\\.)*)"[^\S\r\n]*'), + 'integer': re.compile(r'([+-]?(?:\d+_?)*\d+)[^\S\r\n]*'), + 'float': None, + 'hex': None, + 'exponent': None, + 'date': None, + 'time': None, + 'boolean': None, + 'null': None, + + 'newline': re.compile(r'\n'), + 'comma': None, + 'equals': None, + 'tilde': None, + 'star': None, + 'openBrace': None, + 'closeBrace': None, + + 'version': None, + 'cellRange': None, + 'tag': None, + 'propName': None, +} + +class Token(Enum): + Equals = '=' + Tilde = '~' + Star = '*' + Comma = ',' + Newline = '\n' + OpenBrace = '{' + CloseBrace = '}' + + +def document(input: str, offset: int): + offset, head, error = header(input, offset) + + if error: + return (offset, None, error) + + offset, table, error = data(input, offset) + + if error: + return (offset, None, error) + + table.header = head + + if offset == len(input): + return (offset, table, None) + + offset, table.format, error = format(input, offset) + + if error: + return (offset, None, error) + + return (offset, table, None) + +def header(input: str, offset: int): + def headerLine(input: str, offset: int): + offset, elt, error = label(input, offset) + + if error: return (offset, [], None) + + offset, matched, error = concat( + repeat(comma, label), newline + )(input, offset) + + if error: return (offset, None, error) + + labels = filter(lambda elt: elt not in (Token.Comma, Token.Newline), matched) + return (offset, [elt] + labels, None) + + start = offset + offset, elts, error = headerLine(input, offset) + + if error: return (offset, None, error) + + offset, verNumber, error = concat(equals, version, newline)(input, offset) + + if error: + return (start, None, error) + elif verNumber[1] != '0.1': + return (start, None, 'invalid version number') + else: + return (offset, elts, None) + +def data(input: str, offset: int): + # TODO: FINISH THIS IMPLEMENTATION + offset, 
rows, error = repeat(row)(input, offset) + + if error: return (offset, None, error) + else: + count = 0 + breaks = [] + table = {} + + table.breaks = breaks + return (offset, table, None) + +def format(input: str, offset: int): + offset, _ignore, errors = concat(star, newline)(input, offset) + + if errors: + return (offset, None, errors) + + return formatRules(input, offset) + +def formatRules(input: str, offset: int): + offset, lines, error = repeat(formatRule)(input, offset) + + if error: + return (offset, None, error) + elif offset != len(input): + return (offset, None, 'format rule') + + rules = {} + + for key, props in lines: + if key in rules: + rules[key] += props + else: + rules[key] = props + + return (offset, TableFormat(rules), None) + +def formatRule(input: str, offset: int): + offset, result, error = concat(cellRange, properties)(input, offset) + + if error: + return (offset, None, error) + else: + return (offset, {result[0]: result[1:]}, None) + +def cellRange(input: str, offset: int): + if match := pattern['cellRange'].match(input, offset): + return (match.end(), match.groups()[0], None) + else: + return (offset, None, 'cell range') + +def properties(input: str, offset: int): + position, props, error = concat( + openBrace, + concat(tag, repeat(comma, tag)), + closeBrace + )(input, offset) + + if error: + return (position, None, error) + else: + filtered = filter(lambda prop: prop is not Token.Comma, props[1:-1]) + return (position, filtered, None) + +def row(input: str, offset: int): + position, rowData, error = altern( + concat(element, repeat(comma, element), newline), + concat(tilde, newline) + )(input, offset) + + if error: + return (position, None, "element or '~'") + elif rowData and rowData[0] is Token.Tilde: + return (position, Token.Tilde, None) + else: + filtered = filter(lambda elt: elt not in (Token.Comma, Token.Newline), rowData) + return (position, filtered, None) + +def element(input: str, offset: int): + return altern(stringValue, 
number, booleanValue, nullValue)(input, offset) + +def label(input: str, offset: int): + return altern(stringValue, nullValue)(input, offset) + +def equals(input: str, offset: int): + if match := pattern['equals'].match(input, offset): + return (match.end(), Token.Equals, None) + else: + return (offset, None, 'header separator') + +def star(input: str, offset: int): + if match := pattern['star'].match(input, offset): + return (match.end(), Token.CloseBrace, None) + else: + return (offset, None, 'format separator') + +def tilde(input: str, offset: int): + if match := pattern['tilde'].match(input, offset): + return (match.end(), Token.Tilde, None) + else: + return (offset, None, 'section separator') + +def comma(input: str, offset: int): + if match := pattern['comma'].match(input, offset): + return (match.end(), Token.Comma, None) + else: + return (offset, None, 'comma') + +def newline(input: str, offset: int): + if match := pattern['newline'].match(input, offset): + return (match.end(), Token.Newline, None) + else: + return (offset, None, 'newline') + +def openBrace(input: str, offset: int): + if match := pattern['openBrace'].match(input, offset): + return (match.end(), Token.OpenBrace, None) + else: + return (offset, None, '"{"') + +def closeBrace(input: str, offset: int): + if match := pattern['closeBrace'].match(input, offset): + return (match.end(), Token.CloseBrace, None) + else: + return (offset, None, '"}"') + +def version(input: str, offset: int): + if match := pattern['version'].match(input, offset): + return (match.end(), match.groups()[0], None) + else: + return (offset, None, 'version number') + +def tag(input: str, offset: int): + if match := pattern['propName'].match(input, offset): + return (match.end(), match.groups()[0], None) + else: + return (offset, None, 'Format Property') + +def stringValue(input: str, offset: int): + escapes = re.compile(r'\\["ntfrb\\]|\\u\{([0-9A-Fa-f]{1,8})\}') + + def replace(match): + if codepoint := match.groups(): + 
return chr(int(codepoint[0], 16)) + else: + chars = { + r'\"': '\"', + r'\n': '\n', + r'\t': '\t', + r'\f': '\f', + r'\r': '\r', + r'\b': '\b', + r'\\': '\\' + } + return chars[match.string] + + if match := pattern['string'].match(input, offset): + value = escapes.sub(replace, match.groups()[0]) + return (match.end(), value, None) + else: + return (offset, None, 'string') + +def number(input: str, offset: int): + return altern(scientific, hexValue, floatValue, intValue)(input, offset) + +def scientific(input: str, offset: int): + if match := pattern['exponent'].match(input, offset): + mantissa = match.groups()[0].replace('_', '') + exponent = match.groups()[1].replace('_', '') + return (match.end(), float(f'{mantissa}e{exponent}'), None) + else: + return (offset, None, 'scientific') + +def floatValue(input: str, offset: int): + if match := pattern['float'].match(input, offset): + value = match.groups()[0].replace('_', '') + return (match.end(), float(value), None) + else: + return (offset, None, 'float') + +def intValue(input: str, offset: int): + if match := pattern['integer'].match(input, offset): + value = match.groups()[0].replace('_', '') + return (match.end(), int(value), None) + else: + return (offset, None, 'integer') + +def hexValue(input: str, offset: int): + if match := pattern['hex'].match(input, offset): + value = match.groups()[0].replace('0x', '', 1).replace('_', '') + return [match.end(), int(value, 16), None] + else: + return (offset, None, 'hexadecimal') + +def booleanValue(input: str, offset: int): + if match := pattern['boolean'].match(input, offset): + value = match.groups()[0] == 'true' + return (match.end(), value, None) + else: + return (offset, None, 'boolean') + +def nullValue(input: str, offset: int): + if match := pattern['null'].match(input, offset): + return (match.end(), None, None) + else: + return (offset, None, 'null') From 59c3ae10a28735ddeb636aaeb16beaa350c3c78d Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Sun, 11 Sep 2022 
23:36:17 -0400 Subject: [PATCH 2/6] python should be feature-complete --- python/src/__init__.py | 10 +++--- python/src/combinators.py | 70 +++++++++++++++++++++++++++++++++++---- python/src/format.py | 69 ++++++++++++++++++++++++++++++++++++-- python/src/parse.py | 58 +++++++++++++++++++------------- python/src/serializers.py | 61 ++++++++++++++++++++++++++++++++++ python/src/table.py | 53 +++++++++++++++++++++++++++++ 6 files changed, 286 insertions(+), 35 deletions(-) create mode 100644 python/src/serializers.py create mode 100644 python/src/table.py diff --git a/python/src/__init__.py b/python/src/__init__.py index 2febdff..e41668c 100644 --- a/python/src/__init__.py +++ b/python/src/__init__.py @@ -1,7 +1,8 @@ - import parse +from serializers import Serializer, TabloSerializer + -defaultSerializer = object() +defaultSerializer: Serializer = TabloSerializer def parse(input: str): _ignore, data, error = parse.document(input, 0) @@ -14,5 +15,6 @@ def parse(input: str): def serialize(table): return defaultSerializer.serialize(table) -def setSerializer(serializer): - defaultSerializer = serializer() +def setSerializer(serializer: Serializer): + global defaultSerializer + defaultSerializer = serializer diff --git a/python/src/combinators.py b/python/src/combinators.py index d935504..71d6380 100644 --- a/python/src/combinators.py +++ b/python/src/combinators.py @@ -1,10 +1,68 @@ +from typing import Iterable -def concat(*funcs): - pass +def concat(*rules): + def combinator(input: str, offset: int): + cursor = offset + results = [] -def altern(*funcs): - pass + for rule in rules: + offset, match, error = rule(input, cursor) -def repeat(*funcs): - pass + if error: + return (cursor, None, error) + else: + cursor = offset + + if isinstance(match, Iterable): + results.extend(match) + else: + results.append(match) + + return (cursor, results, None) + + return combinator + +def altern(*rules): + def combinator(input: str, offset: int): + cursor = offset + errors = [] + + for 
rule in rules: + offset, result, error = rule(input, cursor) + + if not error: + return (offset, result, None) + else: + cursor = offset + errors.extend(error) + + return (offset, None, f'one of {",".join(errors)}') + + return combinator + +def repeat(*rules): + def combinator(input: str, offset: int): + cursor = offset + results = [] + error = None + + while True: + first, *rest = rules + cursor, result, error = first(input, cursor) + + if error: + break + else: + results.append(result) + + cursor, result, error = concat(*rest)(input, cursor) + + if error: + return (offset, None, error) + else: + results.extend(result) + + return (cursor, results, error) + + return combinator diff --git a/python/src/format.py b/python/src/format.py index d6751ea..6f3181c 100644 --- a/python/src/format.py +++ b/python/src/format.py @@ -11,6 +11,71 @@ def __init__(self, rules): for key, props in rules: if match := rule.match(key): - startRow, endRow, startCol, endCol = (0, -1, 0, -1) + start_row, end_row, start_col, end_col = (0, -1, 0, -1) - \ No newline at end of file + groups = match.groups() + + if groups[0] is not None: + start_col = self._alpha_to_int(groups[0]) + end_col = self._alpha_to_int(groups[1]) if groups[1] is not None else start_col + elif groups[2] is not None: + start_row = int(groups[2]) + end_row = int(groups[3]) if groups[3] is not None else start_row + elif groups[4] is not None: + start_col = self._alpha_to_int(groups[4]) + start_row = int(groups[5]) + end_col = self._alpha_to_int(groups[6]) if groups[6] is not None else start_col + end_row = int(groups[7]) if groups[7] is not None else start_row + + if ((start_row <= end_row or end_row == -1) and + (start_col <= end_col or end_col == -1)): + bounds = (start_col, end_col, start_row, end_row) + self.rules.append(( + bounds, key, props + )) + + + + def get_props(self, col, row): + def applicable(rule): + bounds, _key, _props = rule + start_col, end_col, start_row, end_row = bounds + + return ( + row >= start_row 
and + (row <= end_row or end_row == -1) and + numeric_column >= start_col and + (numeric_column <= end_col or end_col == -1) + ) + + numeric_column = self._alpha_to_int(col) + + result = [] + + for rule in filter(applicable, self.rules): + _bounds, _key, props = rule + result.extend(props) + + return result + + def get_rules(self): + rules = {} + + for rule in self.rules: + _bounds, key, props = rule + + if key in rules: + rules[key].extend(props) + else: + rules[key] = props + + return rules + + def _alpha_to_int(self, index: str): + alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + value = 0 + + for idx, char in enumerate(reversed(index.split(''))): + value += alphabet.index(char) * 26 ** idx + + return value diff --git a/python/src/parse.py b/python/src/parse.py index 1ecdc01..341fccb 100644 --- a/python/src/parse.py +++ b/python/src/parse.py @@ -1,33 +1,35 @@ +from functools import reduce import re from enum import Enum from combinators import concat, altern, repeat from format import TableFormat +from table import Table pattern = { 'string': re.compile(r'"((?:[^"\n\r\b\\]|\\.)*)"[^\S\r\n]*'), 'integer': re.compile(r'([+-]?(?:\d+_?)*\d+)[^\S\r\n]*'), - 'float': None, - 'hex': None, - 'exponent': None, - 'date': None, - 'time': None, - 'boolean': None, - 'null': None, + 'float': re.compile(r'([+-]?(?:(?:(?:0|[1-9](?:_?\d+)*)\.(?:(?:\d+_?)*\d+)?)|0\.|\.\d+))[^\S\r\n]*'), + 'hex': re.compile(r'([+-]?0x(?:[\dA-Fa-f]+_?)*[\dA-Fa-f]+)[^\S\r\n]*'), + 'exponent': re.compile(r'([+-]?(?:(?:0|[1-9](?:_?\d+)*\.(?:(?:\d+_?)*\d+)?)|0\.|\.\d+|(?:\d+_?)*\d+))[eE]([+-]?(?:\d+_?)*\d+)[^\S\r\n]*'), + 'date': re.compile(r'#(?:(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?)?'), + 'time': re.compile(r'(\d{2})(?::(\d{2})(?::(\d{2})(?:\.(\d{4}))?)?)?(Z|[+-]?\d{4})?'), + 'boolean': re.compile(r'(true|false)[^\S\r\n]*'), + 'null': re.compile(r'-[^\S\r\n]*'), 'newline': re.compile(r'\n'), - 'comma': None, - 'equals': None, - 'tilde': None, - 'star': None, - 'openBrace': None, - 'closeBrace': None, - - 
'version': None, - 'cellRange': None, - 'tag': None, - 'propName': None, + 'comma': re.compile(r',[^\S\r\n]*'), + 'equals': re.compile(r'='), + 'tilde': re.compile(r'~'), + 'star': re.compile(r'\*'), + 'openBrace': re.compile(r'{[^\S\r\n]*'), + 'closeBrace': re.compile(r'}[^\S\r\n]*\n'), + + 'version': re.compile(r' ?(\d+\.\d+)'), + 'cellRange': re.compile(r''), + 'tag': re.compile(r'([A-Za-z_][A-Za-z0-9_-]*)[^\S\r\n]*'), + 'propName': re.compile(r'(plain|bold|italic|underline|strike|normal|mono|black|red|orange|yellow|green|blue|violet|grey|white)[^\S\r\n]*'), } class Token(Enum): @@ -93,15 +95,25 @@ def headerLine(input: str, offset: int): return (offset, elts, None) def data(input: str, offset: int): - # TODO: FINISH THIS IMPLEMENTATION offset, rows, error = repeat(row)(input, offset) - if error: return (offset, None, error) + if error: + return (offset, None, error) else: - count = 0 - breaks = [] - table = {} + def process(result, elt): + count, rows, breaks = result + + if elt == Token.Tilde: + breaks.append(count) + else: + count += 1 + result.append(elt) + + return (count, rows, breaks) + + _count, result, breaks = reduce(process, rows, (0, [], [])) + table = Table(None, result, []) table.breaks = breaks return (offset, table, None) diff --git a/python/src/serializers.py b/python/src/serializers.py new file mode 100644 index 0000000..9bad64b --- /dev/null +++ b/python/src/serializers.py @@ -0,0 +1,61 @@ +from table import Table + + +column_labels = {} + +def int_to_alpha(value: int): + alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + + def decay(value: int): + while value > 0: + yield value + value //= 26 + + if value not in column_labels: + result = '' + + for v in decay(value): + result = alphabet[v % 26] + result + + column_labels[value] = result or 'A' + + return column_labels[value] + + +class Serializer(object): + @classmethod + def serialize(cls, table): + raise NotImplementedError() + +class TabloSerializer(Serializer): + @classmethod + def serialize(cls, 
table): + header = cls.serialize_header(table) + data = cls.serialize_data(table) + format = cls.serialize_format(table.format) + + return f'{header}=0.1\n{data}\n{format}' + + @classmethod + def serialize_header(cls, table: Table): + if not table.header: + return '' + + return ','.join(cls.serialize_item(val) for val in table.header) + '\n' + + @classmethod + def serlialize_data(cls, table: Table): + return '\n'.join(','.join( + cls.serialize_item(elt) for elt in row + ) for row in table.data) + + @classmethod + def serialize_item(cls, item): + if isinstance(item, str): + return f'"{item}"' + elif isinstance(item, (int, float)): + return str(item) + elif isinstance(item, bool): + return 'true' if item else 'false' + elif item is None: + return '-' \ No newline at end of file diff --git a/python/src/table.py b/python/src/table.py new file mode 100644 index 0000000..162ff8f --- /dev/null +++ b/python/src/table.py @@ -0,0 +1,53 @@ +import re +from typing import Any +from format import TableFormat + + +class Table(object): + def __init__(self, header, rows, format): + self.header = header + self.data = rows + self.format = format + self.breaks = [] + + def concat(self, rows): + self.data += rows + + def get(self, column, row): + if isinstance(column, str): + column = self._alpha_to_int(column) + + return self.data[row][column] + + def get_row(self, row): + return self.data[row] + + def __getattribute__(self, name: str) -> Any: + if match := re.match(r'([A-Z]+)?([0-9]+)?', name): + col, row = (None, None) + + if col_str := match.groups()[0]: + col = self._alpha_to_int(col_str) + + if row_str := match.groups()[1]: + row = int(row_str) + + if row is not None and col is not None: + return self.data[row][col] + elif row is not None: + return self.data[row] + elif col is not None: + return (row[col] for row in self.data) + else: + raise KeyError() + else: + raise KeyError() + + def _alpha_to_int(self, index: str): + alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + value = 0 + + 
for idx, char in enumerate(reversed(index.split(''))): + value += alphabet.index(char) * 26 ** idx + + return value From 5822f6041722d4de8016c088e258b0994755c0ea Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Mon, 12 Sep 2022 00:07:44 -0400 Subject: [PATCH 3/6] add html serializer --- python/src/serializers.py | 58 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/python/src/serializers.py b/python/src/serializers.py index 9bad64b..be9dd6a 100644 --- a/python/src/serializers.py +++ b/python/src/serializers.py @@ -27,6 +27,7 @@ class Serializer(object): def serialize(cls, table): raise NotImplementedError() + class TabloSerializer(Serializer): @classmethod def serialize(cls, table): @@ -58,4 +59,59 @@ def serialize_item(cls, item): elif isinstance(item, bool): return 'true' if item else 'false' elif item is None: - return '-' \ No newline at end of file + return '-' + + +class HtmlSerializer(Serializer): + @classmethod + def serialize(cls, table: Table): + header = cls.serialize_header(table) + data = cls.serialize_data(table) + + return f'{header}{data}\n
' + + @classmethod + def serialize_header(cls, table: Table): + if not table.header: + return '' + + def serialize_item(index: int, item): + col = int_to_alpha(index) + value = cls.serialize_item(item) + return f'{value}' + + items = (serialize_item(index, item) for index, item in enumerate(table.header)) + + return f'\n \n {"\n ".join(items)}\n ' + + @classmethod + def serialize_data(cls, table: Table): + def serialize_item(col_idx, row_idx, item): + col_str = int_to_alpha(col_idx) + value = cls.serialize_item(item) + + if props := table.format and table.format.get_props(col_str, row_idx): + class_attr = f' class="{" ".join(props)}"' + else: + class_attr = '' + + return f'{value}' + + def serialize_row(row_idx, row): + items = (serialize_item(col_idx, row_idx, item) for col_idx, item in enumerate(row)) + return f'\n {"\n ".join(items)}\n ' + + rows = (serialize_row(index, row) for index, row in enumerate(table.data)) + + return f'\n \n {"\n ".join(rows)}\n ' + + @classmethod + def serialize_item(cls, item): + if isinstance(item, str): + return item + elif isinstance(item, (int, float)): + return str(item) + elif isinstance(item, bool): + return 'True' if item else 'False' + elif item is None: + return '' From 38ea39910d5774bb2f76a886da6f0a88c6f63b5c Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Mon, 12 Sep 2022 21:24:59 -0400 Subject: [PATCH 4/6] bugfixes and project structure --- .gitignore | 1 + python/README.md | 66 ++++++++++++++++++++++++++++ python/{src => tablo}/__init__.py | 8 ++-- python/{src => tablo}/combinators.py | 11 ++--- python/{src => tablo}/format.py | 6 +-- python/{src => tablo}/parse.py | 16 +++---- python/{src => tablo}/serializers.py | 21 ++++++--- python/{src => tablo}/table.py | 21 ++++++--- 8 files changed, 119 insertions(+), 31 deletions(-) create mode 100644 python/README.md rename python/{src => tablo}/__init__.py (63%) rename python/{src => tablo}/combinators.py (84%) rename python/{src => tablo}/format.py (95%) rename python/{src 
=> tablo}/parse.py (95%) rename python/{src => tablo}/serializers.py (82%) rename python/{src => tablo}/table.py (67%) diff --git a/.gitignore b/.gitignore index 9daa824..4b0b674 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store +__pycache__ node_modules diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..6023e17 --- /dev/null +++ b/python/README.md @@ -0,0 +1,66 @@ +# The Tablo File Format + +__tablo__ is a plain text interchange format for tabular data. It is more +expressive than CSV while remaining easy for people to read and write. + +It adds explicit headers, datatypes, and cell formatting to address +shortcomings of delimiter-separated formats like CSV or TSV. __tablo__ solves +delimiter collision issues by having well-defined quoting and escaping rules +that are familiar to users of formats like JSON or common programming languages +like Python or JavaScript. + +## What's Wrong with CSV? + +> the Microsoft version of CSV is a textbook example of how *not* to design a +> textual file format + +—Eric S. Raymond, [*The Art of Unix Programming*][taoup] + +Stated simply, there is no single CSV standard. It exists as a myriad of +informal variants whose implementation varies from vendor to vendor. Character +encodings and escape sequences vary from one application to the next, and the +ambiguities in various edge cases means that the output of one application may +not be readable by another. + +__tablo__ is designed to solve a number of ambiguities and shortcomings in CSV. + +One of the first obvious differences is that header rows are optional, but +well-defined. In other words, a document may or may not contain a header, but +determining whether the document includes a header is always unambiguous. + +A crucial aspect of the __tablo__ format is that it doesn't make assumptions +about the type of data in each cell. If a value is surrounded by quotes, it is +*always* a string. 
If a value is a number without quotes, it is *always* a +number. If a value is an ISO-8601 formatted date preceded by a hash mark, it +is *always* a datetime. + +## Installation + +Install with `pip`: + +``` +python -m pip install tablo-fyi +``` + +## Usage + +Parsing is accomplished with the `parse` function. + +``` +from tablo import parse, serialize + +data = parse('"name", "age"\n=0.1\n"Tom", 24\n"Jerry", 27\n') + +name = data['A0'] # Retrieves the value in column A, row 0 => 'Tom' +age = data['B1'] # Retrieves the value in column B, row 1 => 27 +``` + +## More Information + +More information can be found in [the __tablo__ specification][spec], and a +set of [example files][examples] can be found in the [project repository][repo]. + +[taoup]: http://www.catb.org/esr/writings/taoup/html/ch05s02.html#id2901882 +[spec]: https://tablo.fyi +[examples]: https://github.com/jotjotdotio/tablo/tree/main/examples +[repo]: https://github.com/jotjotdotio/tablo diff --git a/python/src/__init__.py b/python/tablo/__init__.py similarity index 63% rename from python/src/__init__.py rename to python/tablo/__init__.py index e41668c..c5613e5 100644 --- a/python/src/__init__.py +++ b/python/tablo/__init__.py @@ -1,14 +1,14 @@ -import parse -from serializers import Serializer, TabloSerializer +from tablo.parse import document +from tablo.serializers import Serializer, TabloSerializer defaultSerializer: Serializer = TabloSerializer def parse(input: str): - _ignore, data, error = parse.document(input, 0) + _ignore, data, error = document(input, 0) if error: - raise error + raise ValueError(error) else: return data diff --git a/python/src/combinators.py b/python/tablo/combinators.py similarity index 84% rename from python/src/combinators.py rename to python/tablo/combinators.py index 71d6380..1024695 100644 --- a/python/src/combinators.py +++ b/python/tablo/combinators.py @@ -7,14 +7,14 @@ def combinator(input: str, offset: int): results = [] for rule in rules: - offset, match, error = 
rule(input, cursor) + newOffset, match, error = rule(input, cursor) if error: - return (cursor, None, error) + return (offset, None, error) else: - cursor = offset + cursor = newOffset - if isinstance(match, Iterable): + if isinstance(match, Iterable) and not isinstance(match, str): results.extend(match) else: results.append(match) @@ -50,8 +50,9 @@ def combinator(input: str, offset: int): while True: first, *rest = rules cursor, result, error = first(input, cursor) - + if error: + error = None break else: results.append(result) diff --git a/python/src/format.py b/python/tablo/format.py similarity index 95% rename from python/src/format.py rename to python/tablo/format.py index 6f3181c..fb7904a 100644 --- a/python/src/format.py +++ b/python/tablo/format.py @@ -4,12 +4,12 @@ class TableFormat(object): rules = None - def __init__(self, rules): + def __init__(self, rules={}): rule = re.compile(r'^([A-Z]+)(?::([A-Z]+))?$|^([0-9]+)(?::([0-9]+))?$|^([A-Z]+)([0-9]+)(?::([A-Z]+)([0-9]+))?$') self.rules = [] - for key, props in rules: + for key, props in rules.items(): if match := rule.match(key): start_row, end_row, start_col, end_col = (0, -1, 0, -1) @@ -75,7 +75,7 @@ def _alpha_to_int(self, index: str): alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' value = 0 - for idx, char in enumerate(reversed(index.split(''))): + for idx, char in enumerate(reversed(index)): value += alphabet.index(char) * 26 ** idx return value diff --git a/python/src/parse.py b/python/tablo/parse.py similarity index 95% rename from python/src/parse.py rename to python/tablo/parse.py index 341fccb..0b58c4e 100644 --- a/python/src/parse.py +++ b/python/tablo/parse.py @@ -2,9 +2,9 @@ import re from enum import Enum -from combinators import concat, altern, repeat -from format import TableFormat -from table import Table +from tablo.combinators import concat, altern, repeat +from tablo.format import TableFormat +from tablo.table import Table pattern = { @@ -27,7 +27,7 @@ 'closeBrace': 
re.compile(r'}[^\S\r\n]*\n'), 'version': re.compile(r' ?(\d+\.\d+)'), - 'cellRange': re.compile(r''), + 'cellRange': re.compile(r'((?:[A-Z]+[\d]+:[A-Z]+[\d]+)|(?:[A-Z]+:[A-Z]+)|(?:[\d]+:[\d]+)|(?:[A-Z]+[\d]+))[^\S\r\n]'), 'tag': re.compile(r'([A-Za-z_][A-Za-z0-9_-]*)[^\S\r\n]*'), 'propName': re.compile(r'(plain|bold|italic|underline|strike|normal|mono|black|red|orange|yellow|green|blue|violet|grey|white)[^\S\r\n]*'), } @@ -107,13 +107,13 @@ def process(result, elt): breaks.append(count) else: count += 1 - result.append(elt) + rows.append(elt) return (count, rows, breaks) _count, result, breaks = reduce(process, rows, (0, [], [])) - table = Table(None, result, []) + table = Table(None, result, TableFormat()) table.breaks = breaks return (offset, table, None) @@ -149,7 +149,7 @@ def formatRule(input: str, offset: int): if error: return (offset, None, error) else: - return (offset, {result[0]: result[1:]}, None) + return (offset, (result[0], result[1:]), None) def cellRange(input: str, offset: int): if match := pattern['cellRange'].match(input, offset): @@ -182,7 +182,7 @@ def row(input: str, offset: int): return (position, Token.Tilde, None) else: filtered = filter(lambda elt: elt not in (Token.Comma, Token.Newline), rowData) - return (position, filtered, None) + return (position, list(filtered), None) def element(input: str, offset: int): return altern(stringValue, number, booleanValue, nullValue)(input, offset) diff --git a/python/src/serializers.py b/python/tablo/serializers.py similarity index 82% rename from python/src/serializers.py rename to python/tablo/serializers.py index be9dd6a..f10eb75 100644 --- a/python/src/serializers.py +++ b/python/tablo/serializers.py @@ -1,4 +1,5 @@ -from table import Table +from tablo.format import TableFormat +from tablo.table import Table column_labels = {} @@ -45,7 +46,7 @@ def serialize_header(cls, table: Table): return ','.join(cls.serialize_item(val) for val in table.header) + '\n' @classmethod - def serlialize_data(cls, 
table: Table): + def serialize_data(cls, table: Table): return '\n'.join(','.join( cls.serialize_item(elt) for elt in row ) for row in table.data) @@ -60,6 +61,13 @@ def serialize_item(cls, item): return 'true' if item else 'false' elif item is None: return '-' + + @classmethod + def serialize_format(cls, format: TableFormat): + rules = format.get_rules() + return '*\n' + '\n'.join( + f'{key} {{{",".join(props)}}}' for key, props in rules.items() + ) class HtmlSerializer(Serializer): @@ -82,7 +90,8 @@ def serialize_item(index: int, item): items = (serialize_item(index, item) for index, item in enumerate(table.header)) - return f'\n \n {"\n ".join(items)}\n ' + sep = "\n " + return f'\n \n {sep.join(items)}\n ' @classmethod def serialize_data(cls, table: Table): @@ -99,11 +108,13 @@ def serialize_item(col_idx, row_idx, item): def serialize_row(row_idx, row): items = (serialize_item(col_idx, row_idx, item) for col_idx, item in enumerate(row)) - return f'\n {"\n ".join(items)}\n ' + sep = "\n " + return f'\n {sep.join(items)}\n ' rows = (serialize_row(index, row) for index, row in enumerate(table.data)) - return f'\n \n {"\n ".join(rows)}\n ' + sep = "\n " + return f'\n \n {sep.join(rows)}\n ' @classmethod def serialize_item(cls, item): diff --git a/python/src/table.py b/python/tablo/table.py similarity index 67% rename from python/src/table.py rename to python/tablo/table.py index 162ff8f..bd8a58a 100644 --- a/python/src/table.py +++ b/python/tablo/table.py @@ -1,6 +1,6 @@ import re from typing import Any -from format import TableFormat +from tablo.format import TableFormat class Table(object): @@ -22,7 +22,7 @@ def get(self, column, row): def get_row(self, row): return self.data[row] - def __getattribute__(self, name: str) -> Any: + def __getitem__(self, name: str) -> Any: if match := re.match(r'([A-Z]+)?([0-9]+)?', name): col, row = (None, None) @@ -33,11 +33,20 @@ def __getattribute__(self, name: str) -> Any: row = int(row_str) if row is not None and col is not 
None: - return self.data[row][col] + try: + return self.data[row][col] + except IndexError: + raise KeyError() elif row is not None: - return self.data[row] + try: + return self.data[row] + except IndexError: + raise KeyError() elif col is not None: - return (row[col] for row in self.data) + try: + return (row[col] for row in self.data) + except IndexError: + raise KeyError() else: raise KeyError() else: @@ -47,7 +56,7 @@ def _alpha_to_int(self, index: str): alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' value = 0 - for idx, char in enumerate(reversed(index.split(''))): + for idx, char in enumerate(reversed(index)): value += alphabet.index(char) * 26 ** idx return value From a2323eacba93b00d5720bb275c90d4e37d15046b Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Mon, 12 Sep 2022 22:38:13 -0400 Subject: [PATCH 5/6] housekeeping --- .gitignore | 7 +++++++ python/README.md | 4 ++-- python/requirements.txt | 3 +++ python/setup.py | 21 +++++++++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 python/requirements.txt create mode 100644 python/setup.py diff --git a/.gitignore b/.gitignore index 4b0b674..4c13563 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ .DS_Store + +# Python __pycache__ +python/build +python/dist +*.egg-info + +# TypeScript node_modules diff --git a/python/README.md b/python/README.md index 6023e17..ae553e3 100644 --- a/python/README.md +++ b/python/README.md @@ -47,9 +47,9 @@ python -m pip install tablo-fyi Parsing is accomplished with the `parse` function. 
``` -from tablo import parse, serialize +import tablo -data = parse('"name", "age"\n=0.1\n"Tom", 24\n"Jerry", 27\n') +data = tablo.parse('"name", "age"\n=0.1\n"Tom", 24\n"Jerry", 27\n') name = data['A0'] # Retrieves the value in column A, row 0 => 'Tom' age = data['B1'] # Retrieves the value in column B, row 1 => 27 diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..b1d7d60 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,3 @@ +hypothesis==6.54.5 +twine==4.0.1 +wheel==0.37.1 diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..edfd3b0 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,21 @@ +import pathlib +from setuptools import setup, find_packages + + +BASE_DIR = pathlib.Path(__file__).parent + +config = { + 'name': 'tablo-fyi', + 'version': '0.4.5', + 'description': "A tabular data format that doesn't make you want to pull your hair out", + 'long_description': (BASE_DIR / "README.md").read_text(), + 'long_description_content_type': 'text/markdown', + 'author': 'Brendan Berg', + 'author_email': 'brendan@berg.industries', + 'license': 'MIT', + 'url': 'https://github.com/jotjotdotio/tablo', + 'install_requires': [], + 'packages': find_packages() +} + +setup(**config) From 6a10fe9f6c104bf86a835c8a563d5f818e40deae Mon Sep 17 00:00:00 2001 From: Brendan Berg Date: Thu, 13 Oct 2022 21:52:52 -0400 Subject: [PATCH 6/6] wip --- .gitignore | 2 ++ python/getting-started.md | 21 +++++++++++++++++++++ python/test/__init__.py | 4 ++++ python/test/header.test.py | 11 +++++++++++ python/test/parser.test.py | 0 5 files changed, 38 insertions(+) create mode 100644 python/getting-started.md create mode 100644 python/test/__init__.py create mode 100644 python/test/header.test.py create mode 100644 python/test/parser.test.py diff --git a/.gitignore b/.gitignore index 4c13563..e64a58a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ __pycache__ python/build python/dist *.egg-info +*.pyc +*.pyo # 
TypeScript node_modules diff --git a/python/getting-started.md b/python/getting-started.md new file mode 100644 index 0000000..cd6aeec --- /dev/null +++ b/python/getting-started.md @@ -0,0 +1,21 @@ +# Development Guide + +## Installing + +1. Run `python -m pip install -r requirements.txt` to install all development dependencies + +## Testing + +1. Run `python -m unittest test` to run the test suite + +## Building + +1. Run `python setup.py sdist bdist_wheel` to build the source and binary +distributions + +## Distributing + +1. Run `twine upload -r testpypi dist/*` to upload build artifacts to the +PyPI test environment +2. Verify that everything looks right +3. Run `twine upload dist/*` to upload build artifacts to PyPI diff --git a/python/test/__init__.py b/python/test/__init__.py new file mode 100644 index 0000000..1c35782 --- /dev/null +++ b/python/test/__init__.py @@ -0,0 +1,4 @@ +import unittest + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/python/test/header.test.py b/python/test/header.test.py new file mode 100644 index 0000000..4c37a11 --- /dev/null +++ b/python/test/header.test.py @@ -0,0 +1,11 @@ +from hypothesis import given +from hypothesis.strategies import text + +from tablo import parse + + +@given(text()) +def test_decode_inverts_encode(s): + input = f'"{s}"\n=0.1\n' + offset, result, error = parse.header(input, 0) + assert(result.header[0] == s) diff --git a/python/test/parser.test.py b/python/test/parser.test.py new file mode 100644 index 0000000..e69de29