|
1 | 1 | import re
|
2 | 2 | import csv
|
| 3 | +import logging |
3 | 4 |
|
4 |
| -from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError |
5 | 5 | from mfr.extensions.tabular import utilities
|
| 6 | +from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE |
| 7 | +from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError |
| 8 | + |
| 9 | +logger = logging.getLogger(__name__) |
6 | 10 |
|
7 | 11 |
|
8 | 12 | def csv_stdlib(fp):
|
9 | 13 | """Read and convert a csv file to JSON format using the python standard library
|
10 |
| - :param fp: File pointer object |
11 |
| - :return: tuple of table headers and data |
| 14 | +
|
| 15 | + Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to |
| 16 | + effectively detect the correct dialect of the file. |
| 17 | +
|
| 18 | + :param fp: the file pointer object |
| 19 | + :return: a tuple of table headers and data |
12 | 20 | """
|
13 |
| - data = fp.read(2048) |
| 21 | + |
| 22 | + # Prepare the first row for sniffing |
| 23 | + data = fp.read(INIT_SNIFF_SIZE) |
| 24 | + data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0) |
| 25 | + |
| 26 | + # Reset the file pointer |
14 | 27 | fp.seek(0)
|
15 | 28 |
|
| 29 | + # Sniff the first row to find a matching format |
16 | 30 | try:
|
17 | 31 | dialect = csv.Sniffer().sniff(data)
|
18 | 32 | except csv.Error:
|
19 | 33 | dialect = csv.excel
|
20 | 34 | else:
|
21 | 35 | _set_dialect_quote_attrs(dialect, data)
|
22 | 36 |
|
| 37 | + # Explicitly delete data when it is no longer used. |
23 | 38 | del data
|
| 39 | + |
| 40 | + # Create the CSV reader with the detected dialect |
24 | 41 | reader = csv.DictReader(fp, dialect=dialect)
|
| 42 | + |
| 43 | + # Update the reader field names to avoid duplicate column names when performing row extraction |
25 | 44 | columns = []
|
26 |
| - # update the reader field names to avoid duplicate column names when performing row extraction |
27 | 45 | for idx, fieldname in enumerate(reader.fieldnames or []):
|
28 | 46 | column_count = sum(1 for column in columns if fieldname == column['name'])
|
29 | 47 | if column_count:
|
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data):
|
92 | 110 | dialect.quotechar = '"'
|
93 | 111 | if re.search('"""[[({]\'.+\',', data):
|
94 | 112 | dialect.doublequote = True
|
| 113 | + |
| 114 | + |
def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
    """Collect the full first row of a file, reading more data from ``fp`` as needed.

    ``text`` is the chunk that has already been read from the start of the file, and ``fp`` is
    positioned on the character right after it. Chunks are scanned one at a time for a new line
    character; whenever a chunk holds none, the next read is twice as large as the previous one.

    :param fp: the file pointer from which further data is read
    :param text: the chunk already read, to be checked for a new line character
    :param read_size: the size used for the last ``fp.read()`` call
    :param size_to_sniff: the accumulated size of the text gathered before ``text``
    :param max_render_size: the max file size for render
    :return: the first row of the file as a string, without its new line character
    :raises TabularRendererError: if the gathered text reaches ``max_render_size``
    """

    pieces = []
    chunk = text

    # An empty chunk means the file is exhausted: the CSV is either empty or consists of a single
    # line without any new line character. Whatever has been gathered so far is the first row.
    while chunk:
        newline_at = _find_new_line(chunk)
        if newline_at != -1:
            # The row ends inside this chunk; keep only the part before the new line character.
            pieces.append(chunk[:newline_at])
            break

        # No new line yet: keep the whole chunk, account for it, and read twice as much next.
        pieces.append(chunk)
        size_to_sniff += read_size
        read_size *= 2
        chunk = fp.read(read_size)

        # There is no point in sniffing a file that is already too large to be rendered.
        if size_to_sniff + len(chunk) >= max_render_size:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )

    return ''.join(pieces)
| 155 | + |
| 156 | +def _find_new_line(text): |
| 157 | + """In the given text string, find the index of the first occurrence of any of the three types |
| 158 | + of new line character. Note: '\n\r' is not a new line character but two, one LF and one CR. |
| 159 | +
|
| 160 | + 1. \r\n Carriage Return (CR) and Line Feed (LF), must be checked first |
| 161 | + 2. \n LF |
| 162 | + 3. \r CR |
| 163 | +
|
| 164 | + :param text: the text string to check |
| 165 | + :return: the index of the first new line character if found. Otherwise, return -1. |
| 166 | + """ |
| 167 | + index = text.find('\r\n') |
| 168 | + if index == -1: |
| 169 | + index = text.find('\n') |
| 170 | + if index == -1: |
| 171 | + index = text.find('\r') |
| 172 | + return index |
0 commit comments