Skip to content

Commit b650a18

Browse files
committed
CSV sniffer sniffs the full first row for dialect detection
CSV sniffer used to sniff a fixed size of 2048 bytes of any given file. However, this led to a wrongly guessed delimiter when the first row was larger than 2048 bytes. The issue is fixed, and accuracy is also improved, by allowing the sniffer to adaptively sniff the full first row as long as it is within the max render file size.
1 parent d1fac6d commit b650a18

File tree

2 files changed

+86
-6
lines changed

2 files changed

+86
-6
lines changed

mfr/extensions/tabular/libs/stdlib_tools.py

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,47 @@
11
import re
22
import csv
3+
import logging
34

4-
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
55
from mfr.extensions.tabular import utilities
6+
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE
7+
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
8+
9+
logger = logging.getLogger(__name__)
610

711

812
def csv_stdlib(fp):
913
"""Read and convert a csv file to JSON format using the python standard library
10-
:param fp: File pointer object
11-
:return: tuple of table headers and data
14+
15+
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to
16+
effectively detect the correct dialect of the file.
17+
18+
:param fp: the file pointer object
19+
:return: a tuple of table headers and data
1220
"""
13-
data = fp.read(2048)
21+
22+
# Prepare the first row for sniffing
23+
data = fp.read(INIT_SNIFF_SIZE)
24+
data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0)
25+
26+
# Reset the file pointer
1427
fp.seek(0)
1528

29+
# Sniff the first row to find a matching format
1630
try:
1731
dialect = csv.Sniffer().sniff(data)
1832
except csv.Error:
1933
dialect = csv.excel
2034
else:
2135
_set_dialect_quote_attrs(dialect, data)
2236

37+
# Explicitly delete data when it is no longer used.
2338
del data
39+
40+
# Create the CSV reader with the detected dialect
2441
reader = csv.DictReader(fp, dialect=dialect)
42+
43+
# Update the reader field names to avoid duplicate column names when performing row extraction
2544
columns = []
26-
# update the reader field names to avoid duplicate column names when performing row extraction
2745
for idx, fieldname in enumerate(reader.fieldnames or []):
2846
column_count = sum(1 for column in columns if fieldname == column['name'])
2947
if column_count:
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data):
92110
dialect.quotechar = '"'
93111
if re.search('"""[[({]\'.+\',', data):
94112
dialect.doublequote = True
113+
114+
115+
def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
    """Read data from a file until its full first row is available and return that row. The
    file starts with ``text`` and the file pointer sits on the character right after ``text``.

    :param fp: the file pointer from which data is read
    :param text: the chunk of text already read, to be scanned for a line break
    :param read_size: the size used for the most recent ``fp.read()`` call
    :param size_to_sniff: the accumulated size of the text to sniff
    :param max_render_size: the max file size for render
    :return: the first row of the file in string
    """

    pieces = []
    # An empty chunk means EOF was hit: the file is empty or its only line ends without any
    # line break character.
    while text:
        newline_at = _find_new_line(text)
        if newline_at != -1:
            # A line break was found: keep everything before it and stop reading.
            pieces.append(text[:newline_at])
            break
        # No line break yet: keep the whole chunk, then fetch more text, doubling the size of
        # each successive read.
        pieces.append(text)
        size_to_sniff += read_size
        read_size *= 2
        text = fp.read(read_size)
        # If the text to sniff now goes over the max file size limit, raise the renderer error
        # since there is no need to sniff when the file is already too large to be rendered.
        if size_to_sniff + len(text) >= max_render_size:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )
    return ''.join(pieces)
154+
155+
156+
def _find_new_line(text):
157+
"""In the given text string, find the index of the first occurrence of any of the three types
158+
of new line character. Note: '\n\r' is not a new line character but two, one LF and one CR.
159+
160+
1. \r\n Carriage Return (CR) and Line Feed (LF), must be checked first
161+
2. \n LF
162+
3. \r CR
163+
164+
:param text: the text string to check
165+
:return: the index of the first new line character if found. Otherwise, return -1.
166+
"""
167+
index = text.find('\r\n')
168+
if index == -1:
169+
index = text.find('\n')
170+
if index == -1:
171+
index = text.find('\r')
172+
return index

mfr/extensions/tabular/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
config = settings.child('TABULAR_EXTENSION_CONFIG')
66

7-
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10Mb
7+
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
88
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
99
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
1010
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
1111

12+
INIT_SNIFF_SIZE = int(config.get('INIT_SNIFF_SIZE', 2 * 1024)) # 2KB
13+
1214
LIBS = config.get_object('LIBS', {
1315
'.csv': [libs.csv_stdlib],
1416
'.tsv': [libs.csv_stdlib],

0 commit comments

Comments
 (0)