Skip to content

Commit b650a18

Browse files
committed
CSV sniffer sniffs the full first row for dialect detection
CSV sniffer used to sniff a fixed size of 2048 bytes of any given file. However, this led to a wrongly guessed delimiter when the first row was larger than 2048 bytes. The issue is fixed, and accuracy is also improved, by allowing the sniffer to adaptively sniff the full first row as long as it is within the max render file size.
1 parent d1fac6d commit b650a18

File tree

2 files changed

+86
-6
lines changed

2 files changed

+86
-6
lines changed

mfr/extensions/tabular/libs/stdlib_tools.py

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,47 @@
11
import re
22
import csv
3+
import logging
34

4-
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
55
from mfr.extensions.tabular import utilities
6+
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE
7+
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
8+
9+
logger = logging.getLogger(__name__)
610

711

812
def csv_stdlib(fp):
913
"""Read and convert a csv file to JSON format using the python standard library
10-
:param fp: File pointer object
11-
:return: tuple of table headers and data
14+
15+
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to
16+
effectively detect the correct dialect of the file.
17+
18+
:param fp: the file pointer object
19+
:return: a tuple of table headers and data
1220
"""
13-
data = fp.read(2048)
21+
22+
# Prepare the first row for sniffing
23+
data = fp.read(INIT_SNIFF_SIZE)
24+
data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0)
25+
26+
# Reset the file pointer
1427
fp.seek(0)
1528

29+
# Sniff the first row to find a matching format
1630
try:
1731
dialect = csv.Sniffer().sniff(data)
1832
except csv.Error:
1933
dialect = csv.excel
2034
else:
2135
_set_dialect_quote_attrs(dialect, data)
2236

37+
# Explicitly delete data when it is no longer used.
2338
del data
39+
40+
# Create the CSV reader with the detected dialect
2441
reader = csv.DictReader(fp, dialect=dialect)
42+
43+
# Update the reader field names to avoid duplicate column names when performing row extraction
2544
columns = []
26-
# update the reader field names to avoid duplicate column names when performing row extraction
2745
for idx, fieldname in enumerate(reader.fieldnames or []):
2846
column_count = sum(1 for column in columns if fieldname == column['name'])
2947
if column_count:
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data):
92110
dialect.quotechar = '"'
93111
if re.search('"""[[({]\'.+\',', data):
94112
dialect.doublequote = True
113+
114+
115+
def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
    """Read data from a file until its full first row is available and return that row. The
    file starts with ``text`` and the file pointer sits on the character right after ``text``.

    :param fp: the file pointer from which data is read
    :param text: the chunk of text already read, to be scanned for a line break
    :param read_size: the size used for the most recent ``fp.read()`` call
    :param size_to_sniff: the accumulated size of the text to sniff
    :param max_render_size: the max file size for render
    :return: the first row of the file in string
    """

    pieces = []
    # An empty chunk means EOF was hit: the file is empty or its only line ends without any
    # line break character.
    while text:
        newline_at = _find_new_line(text)
        if newline_at != -1:
            # A line break was found: keep everything before it and stop reading.
            pieces.append(text[:newline_at])
            break
        # No line break yet: keep the whole chunk, then fetch more text, doubling the size of
        # each successive read.
        pieces.append(text)
        size_to_sniff += read_size
        read_size *= 2
        text = fp.read(read_size)
        # If the text to sniff now goes over the max file size limit, raise the renderer error
        # since there is no need to sniff when the file is already too large to be rendered.
        if size_to_sniff + len(text) >= max_render_size:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )
    return ''.join(pieces)
154+
155+
156+
def _find_new_line(text):
157+
"""In the given text string, find the index of the first occurrence of any of the three types
158+
of new line character. Note: '\n\r' is not a new line character but two, one LF and one CR.
159+
160+
1. \r\n Carriage Return (CR) and Line Feed (LF), must be checked first
161+
2. \n LF
162+
3. \r CR
163+
164+
:param text: the text string to check
165+
:return: the index of the first new line character if found. Otherwise, return -1.
166+
"""
167+
index = text.find('\r\n')
168+
if index == -1:
169+
index = text.find('\n')
170+
if index == -1:
171+
index = text.find('\r')
172+
return index

mfr/extensions/tabular/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
config = settings.child('TABULAR_EXTENSION_CONFIG')
66

7-
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10Mb
7+
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
88
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
99
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
1010
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
1111

12+
INIT_SNIFF_SIZE = int(config.get('INIT_SNIFF_SIZE', 2 * 1024)) # 2KB
13+
1214
LIBS = config.get_object('LIBS', {
1315
'.csv': [libs.csv_stdlib],
1416
'.tsv': [libs.csv_stdlib],

0 commit comments

Comments
 (0)