Skip to content

Commit

Permalink
feat: add in mini parser for quoteless
Browse files Browse the repository at this point in the history
  • Loading branch information
willfarrell committed Aug 27, 2022
1 parent d0d1428 commit a274ba5
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 10 deletions.
14 changes: 8 additions & 6 deletions bin/esbuild
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/env sh
# Build the distributable bundles with the locally installed esbuild binary.
# NOTE(review): the index/parse/format commands appear twice in each group of
# this listing; the repeats look like the old and new sides of the diff and
# only rebuild the same output files -- confirm before deduplicating.

# CommonJS outputs (*.cjs): minified, targeting Node 14. index.js is only
# minified; parse, parse-mini and format are additionally bundled.
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs index.js --minify --allow-overwrite --outfile=index.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs parse.js --bundle --minify --allow-overwrite --outfile=parse.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs format.js --bundle --minify --allow-overwrite --outfile=format.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs index.js --minify --allow-overwrite --outfile=index.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs parse.js --bundle --minify --allow-overwrite --outfile=parse.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs parse-mini.js --bundle --minify --allow-overwrite --outfile=parse-mini.cjs
node_modules/.bin/esbuild --platform=node --target=node14 --format=cjs format.js --bundle --minify --allow-overwrite --outfile=format.cjs

# ES module outputs (*.mjs): minified, each with an external source map.
node_modules/.bin/esbuild --platform=node --format=esm index.js --minify --sourcemap=external --allow-overwrite --outfile=index.mjs
node_modules/.bin/esbuild --platform=node --format=esm parse.js --bundle --minify --sourcemap=external --allow-overwrite --outfile=parse.mjs
node_modules/.bin/esbuild --platform=node --format=esm format.js --bundle --minify --sourcemap=external --allow-overwrite --outfile=format.mjs
node_modules/.bin/esbuild --platform=node --format=esm index.js --minify --sourcemap=external --allow-overwrite --outfile=index.mjs
node_modules/.bin/esbuild --platform=node --format=esm parse.js --bundle --minify --sourcemap=external --allow-overwrite --outfile=parse.mjs
node_modules/.bin/esbuild --platform=node --format=esm parse-mini.js --bundle --minify --sourcemap=external --allow-overwrite --outfile=parse-mini.mjs
node_modules/.bin/esbuild --platform=node --format=esm format.js --bundle --minify --sourcemap=external --allow-overwrite --outfile=format.mjs

3 changes: 3 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
// import {TextDecoder} from 'node:util'
// import {defaultOptions, optionDetectNewlineValue} from './options.js'
import csvParse from 'csv-rex/parse'
import csvParseMini from 'csv-rex/parse-mini'
import csvFormat from 'csv-rex/format'

// Public API of the package entry point. The same three functions are
// reachable both as named exports and as members of the default export.
const api = {
  parse: csvParse,
  parseMini: csvParseMini,
  format: csvFormat
}

export const { parse, parseMini, format } = api

export default api
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "csv-rex",
"version": "0.3.1",
"version": "0.4.0",
"description": "A tiny and fast CSV parser for JavaScript.",
"type": "module",
"files": [
Expand All @@ -26,6 +26,14 @@
"default": "./parse.cjs"
}
},
"./parse-mini": {
"import": {
"default": "./parse-mini.mjs"
},
"require": {
"default": "./parse-mini.cjs"
}
},
"./format": {
"import": {
"default": "./format.mjs"
Expand All @@ -41,7 +49,8 @@
"pre-commit": "lint-staged",
"lint": "prettier --write *.{js,json} && standard --fix *.js",
"test": "c8 node --test",
"build": "./bin/esbuild"
"build": "./bin/esbuild",
"bench": "npm run build && node parse.bench.js"
},
"repository": {
"type": "git",
Expand Down
154 changes: 154 additions & 0 deletions parse-mini.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// Default options for the quote-less "mini" parser.
// When feeding the parser in chunks: chunkSize >> largest expected row
const defaultOptions = {
  header: true, // false: return array; true: detect headers and return json; [...]: use defined headers and return json
  newlineChar: '\r\n', // '\r\n': Windows; '\n': Linux/Mac (the mini parser does not detect the newline)
  // The mini parser has no delimiter detection and an empty delimiter can
  // never separate two fields, so the default has to be a real character.
  delimiterChar: ',',
  // Not implemented by the mini parser (quote-less input only):
  // quoteChar: '"',
  // escapeChar: '"', // default: `quoteChar`

  // Parse
  emptyFieldValue: '', // substituted for every empty field
  coerceField: (field) => field, // called as (field, columnIndex); TODO tests
  // Not implemented by the mini parser:
  // commentPrefixValue: false, // falsy: disable, '//': enabled
  // errorOnComment: true,
  // errorOnEmptyLine: true,
  errorOnFieldsMismatch: true // report rows whose field count differs from the header's
  // errorOnFieldMalformed: true
}

// Shared accessor for `.length`; works on anything property access works on
// (strings, arrays, booleans -> undefined) and throws on null/undefined.
const length = (value) => {
  const { length: size } = value
  return size
}

/**
 * Create a chunk parser for quote-less ("mini") CSV text.
 *
 * Rows are split purely on `newlineChar` and fields purely on `delimiterChar`;
 * quotes, escapes and comments are not interpreted.
 *
 * @param {object} [opts] Overrides merged over `defaultOptions`.
 * @param {boolean|string[]} [opts.header] `true`: take the column names from
 *   the first complete line; array: use these column names; `false`: emit
 *   every row as an array.
 * @param {string} [opts.newlineChar] Row separator.
 * @param {string} [opts.delimiterChar] Field separator, one or more characters
 *   long. An empty value falls back to `,`.
 * @param {*} [opts.emptyFieldValue] Value substituted for empty fields.
 * @param {Function} [opts.coerceField] Called as `(field, columnIndex)`; its
 *   return value is what ends up in the row.
 * @param {boolean} [opts.errorOnFieldsMismatch] Emit a `FieldsMismatch` error
 *   record when a row's field count differs from the header's. Mismatched rows
 *   are dropped either way.
 * @returns {{chunkParse: Function, header: Function, previousChunk: Function}}
 *   - `chunkParse(string, controller, flush = false)` parses one chunk and
 *     emits `{ idx, data }` / `{ idx, err: { code, message } }` records through
 *     `controller.enqueue`. Without `flush` the last (possibly incomplete)
 *     line is held back; with `flush` everything is parsed.
 *   - `header()` returns the current header (option value or detected names).
 *   - `previousChunk()` returns the held-back partial line, to be prepended to
 *     the next chunk.
 */
export const parse = (opts = {}) => {
  const options = { ...defaultOptions, ...opts }

  let { header } = options
  const {
    newlineChar,
    emptyFieldValue,
    coerceField,
    errorOnFieldsMismatch
  } = options
  // An empty delimiter can never separate two fields (`indexOf('')` matches
  // at every position), and nothing here detects one, so fall back to a comma.
  const delimiterChar = options.delimiterChar || ','
  const delimiterLength = length(delimiterChar)
  // Booleans have no length, so this stays falsy until column names are known.
  let headerLength = length(header)

  let enqueue
  let partialLine = ''
  let idx = 0

  const enqueueError = (code, message) => {
    enqueue({ idx, err: { code, message } })
  }

  // Emit one parsed row: as an array when no header is in use, otherwise as
  // an object keyed by column name (or as an error / nothing on a mismatch).
  const enqueueRow = (row) => {
    idx += 1
    if (!headerLength) {
      enqueue({ idx, data: row })
      return
    }

    const rowLength = length(row)
    if (headerLength !== rowLength) {
      if (errorOnFieldsMismatch) {
        enqueueError(
          'FieldsMismatch',
          `Incorrect number of fields parsed, expected ${headerLength}.`
        )
      }
      return
    }

    const data = {}
    for (let i = 0; i < rowLength; i++) {
      data[header[i]] = row[i]
    }
    enqueue({ idx, data })
  }

  const transformField = (field, idx) => {
    return coerceField(field || emptyFieldValue, idx)
  }

  // Split one line into coerced fields.
  const parseLine = (line) => {
    const row = []
    // An empty line carries no fields at all; with a header in use it is
    // then reported through the fields-mismatch path.
    if (line === '') return row

    let cursor = 0
    let delimiterIndex = line.indexOf(delimiterChar, cursor)
    while (delimiterIndex !== -1) {
      row.push(
        transformField(line.substring(cursor, delimiterIndex), length(row))
      )
      // Skip the whole delimiter, which may be longer than one character.
      cursor = delimiterIndex + delimiterLength
      delimiterIndex = line.indexOf(delimiterChar, cursor)
    }
    // Whatever follows the last delimiter is the final field, even when it is
    // empty (a line ending in a delimiter has a trailing empty field).
    row.push(transformField(line.substring(cursor), length(row)))
    return row
  }

  const chunkParse = (string, controller, flush = false) => {
    enqueue = controller.enqueue
    const lines = string.split(newlineChar) // TODO use cursor pattern
    if (flush) {
      // End of input: nothing is carried over, and an empty final entry is
      // only the residue of a trailing newline, not a row.
      partialLine = ''
      if (lines[length(lines) - 1] === '') lines.pop()
    } else {
      // More input may follow, so the last entry may be an incomplete line.
      partialLine = lines.pop() ?? ''
    }
    const linesLength = length(lines)

    let i = 0
    if (header === true) {
      // No complete line yet: keep waiting for the header row.
      if (!linesLength) return
      header = lines[i].split(delimiterChar)
      headerLength = length(header)
      i += 1
    }

    for (; i < linesLength; i++) {
      enqueueRow(parseLine(lines[i]))
    }
  }

  return {
    chunkParse,
    header: () => header,
    previousChunk: () => partialLine
  }
}

/**
 * Parse a complete quote-less CSV string in one call.
 *
 * The input is cut into `chunkSize`-character slices; each slice is prefixed
 * with the partial line left over from the previous one and handed to the
 * chunk parser, and whatever remains at the end is flushed.
 *
 * @param {string} input Full CSV text.
 * @param {object} [opts] Parser options (merged over `defaultOptions`), plus:
 * @param {boolean} [opts.enableReturn=true] Collect and return the rows.
 * @param {number} [opts.chunkSize=67108864] Slice size in characters; should
 *   be much larger than the longest row.
 * @param {Function} [opts.enqueue] Called with every emitted record.
 * @returns {Array|false} With `enableReturn`, the `data` of every emitted
 *   record in order (records without `data` contribute `undefined`);
 *   otherwise `false`.
 * @throws {RangeError} If `chunkSize` is not a positive number.
 */
const parseMini = (input, opts) => {
  const options = {
    ...defaultOptions,
    enableReturn: true,
    chunkSize: 64 * 1024 * 1024,
    enqueue: () => {},
    ...opts
  }
  const { chunkSize, enableReturn, enqueue } = options
  // A zero or negative step would never advance through the input.
  if (!(chunkSize > 0)) {
    throw new RangeError('chunkSize must be a positive number')
  }
  const { chunkParse, previousChunk } = parse(options)

  const res = []
  const controller = { enqueue }

  if (enableReturn) {
    controller.enqueue = (row) => {
      enqueue(row)
      res.push(row.data)
    }
  }

  for (let position = 0; position < input.length; position += chunkSize) {
    const chunk =
      previousChunk() + input.substring(position, position + chunkSize)
    chunkParse(chunk, controller)
  }

  // flush: only when a partial line is actually left over, so input that
  // ends with a newline does not produce a phantom empty row.
  const remainder = previousChunk()
  if (remainder) {
    chunkParse(remainder, controller, true)
  }

  return enableReturn && res
}

export default parseMini
1 change: 0 additions & 1 deletion parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ export const detectChar = (chunk, pattern) => {
const chars = {}
while ((match = pattern.exec(chunk))) {
const char = match[0]
console.log({ char, chars })
chars[char] ??= 0
chars[char] += 1
if (chars[char] > 5) return char
Expand Down

0 comments on commit a274ba5

Please sign in to comment.