Skip to content

Commit

Permalink
fix #994 Allow UTF8StreamJsonParser to be used without canonicalization
Browse files Browse the repository at this point in the history
Previously, the ReaderBasedJsonParser was used instead, which
is less performant when reading from an InputStream (and handling
charset decoding in addition to json parsing).

This commit updates the JsonFactory factory methods to respect
the canonicalization configuration, where previously a canonicalizing
implementaiton was always used.

I have added guards around both `_symbols.addName` and
`_symbols.findName` based on the existing implementation from
`SmileParser`. For correctness, only the guards around `addName`
are required, but we avoid unnecessary hashing by guarding
both.
  • Loading branch information
carterkozak committed Apr 21, 2023
1 parent 2b41925 commit 024c3ca
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 55 deletions.
6 changes: 3 additions & 3 deletions src/main/java/com/fasterxml/jackson/core/JsonFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -1299,7 +1299,7 @@ public JsonParser createNonBlockingByteArrayParser() throws IOException
// for non-JSON input:
_requireJSONFactory("Non-blocking source not (yet?) supported for this format (%s)");
IOContext ctxt = _createNonBlockingContext(null);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
return new NonBlockingJsonParser(ctxt, _parserFeatures, can);
}

Expand All @@ -1326,7 +1326,7 @@ public JsonParser createNonBlockingByteBufferParser() throws IOException
// for non-JSON input:
_requireJSONFactory("Non-blocking source not (yet?) supported for this format (%s)");
IOContext ctxt = _createNonBlockingContext(null);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
return new NonBlockingByteBufferJsonParser(ctxt, _parserFeatures, can);
}

Expand Down Expand Up @@ -1849,7 +1849,7 @@ protected JsonParser _createParser(DataInput input, IOContext ctxt) throws IOExc
// Also: while we can't do full bootstrapping (due to read-ahead limitations), should
// at least handle possible UTF-8 BOM
int firstByte = ByteSourceJsonBootstrapper.skipUTF8BOM(input);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChildOrPlaceholder(_factoryFeatures);
return new UTF8DataInputJsonParser(ctxt, _parserFeatures, input,
_objectCodec, can, firstByte);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,14 +257,9 @@ public JsonParser constructParser(int parserFeatures, ObjectCodec codec,
int bytesProcessed = _inputPtr - prevInputPtr;

if (enc == JsonEncoding.UTF8) {
/* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
* (which is ok for larger input; not so hot for smaller; but this is not a common case)
*/
if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) {
ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
_inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
}
ByteQuadsCanonicalizer can = rootByteSymbols.makeChildOrPlaceholder(factoryFeatures);
return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
_inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
}
return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec,
rootCharSymbols.makeChild(factoryFeatures));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ public class UTF8DataInputJsonParser
*/
final protected ByteQuadsCanonicalizer _symbols;

/**
* Marker flag to indicate that standard symbol handling is used
* (one with symbol table assisted canonicalization. May be disabled
* in which case alternate stream-line, non-canonicalizing handling
* is used: usually due to set of symbols
* (Object property names) is unbounded and will not benefit from
* canonicalization attempts.
*
* @since TODO(ckozak): determine target version, if accepted.
*/
final protected boolean _symbolsCanonical;

/*
/**********************************************************
/* Parsing state
Expand Down Expand Up @@ -127,6 +139,7 @@ public UTF8DataInputJsonParser(IOContext ctxt, int features, DataInput inputData
super(ctxt, features);
_objectCodec = codec;
_symbols = sym;
_symbolsCanonical = sym.isCanonicalizing();
_inputData = inputData;
_nextByte = firstByte;
}
Expand Down Expand Up @@ -1580,7 +1593,7 @@ protected final String parseEscapedName(int[] quads, int qlen, int currQuad, int
}
quads[qlen++] = pad(currQuad, currQuadBytes);
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand Down Expand Up @@ -1654,7 +1667,7 @@ protected String _handleOddName(int ch) throws IOException
}
quads[qlen++] = currQuad;
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand Down Expand Up @@ -1754,7 +1767,7 @@ protected String _parseAposName() throws IOException
}
quads[qlen++] = pad(currQuad, currQuadBytes);
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand All @@ -1771,10 +1784,12 @@ private final String findName(int q1, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q1 = pad(q1, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -1785,10 +1800,12 @@ private final String findName(int q1, int q2, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q2 = pad(q2, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -1800,9 +1817,11 @@ private final String findName(int q1, int q2, int q3, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q3 = pad(q3, lastQuadBytes);
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
if (_symbolsCanonical) {
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
}
}
int[] quads = _quadBuffer;
quads[0] = q1;
Expand All @@ -1818,7 +1837,7 @@ private final String findName(int[] quads, int qlen, int lastQuad, int lastQuadB
_quadBuffer = quads = _growArrayBy(quads, quads.length);
}
quads[qlen++] = pad(lastQuad, lastQuadBytes);
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
return addName(quads, qlen, lastQuadBytes);
}
Expand Down Expand Up @@ -1933,6 +1952,9 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)

// Ok. Now we have the character array, and can construct the String
String baseName = new String(cbuf, 0, cix);
if (!_symbolsCanonical) {
return baseName;
}
// And finally, un-align if necessary
if (lastQuadBytes < 4) {
quads[qlen-1] = lastQuad;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@ public class UTF8StreamJsonParser
*/
final protected ByteQuadsCanonicalizer _symbols;

/**
* Marker flag to indicate that standard symbol handling is used
* (one with symbol table assisted canonicalization. May be disabled
* in which case alternate stream-line, non-canonicalizing handling
* is used: usually due to set of symbols
* (Object property names) is unbounded and will not benefit from
* canonicalization attempts.
*
* @since TODO(ckozak): determine target version, if accepted.
*/
final protected boolean _symbolsCanonical;

/*
/**********************************************************
/* Parsing state
Expand Down Expand Up @@ -192,6 +204,7 @@ public UTF8StreamJsonParser(IOContext ctxt, int features, InputStream in,
_inputStream = in;
_objectCodec = codec;
_symbols = sym;
_symbolsCanonical = sym.isCanonicalizing();
_inputBuffer = inputBuffer;
_inputPtr = start;
_inputEnd = end;
Expand Down Expand Up @@ -2112,7 +2125,7 @@ protected final String parseEscapedName(int[] quads, int qlen, int currQuad, int
}
quads[qlen++] = _padLastQuad(currQuad, currQuadBytes);
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand Down Expand Up @@ -2192,7 +2205,7 @@ protected String _handleOddName(int ch) throws IOException
}
quads[qlen++] = currQuad;
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand Down Expand Up @@ -2297,7 +2310,7 @@ protected String _parseAposName() throws IOException
}
quads[qlen++] = _padLastQuad(currQuad, currQuadBytes);
}
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
name = addName(quads, qlen, currQuadBytes);
}
Expand All @@ -2314,10 +2327,12 @@ private final String findName(int q1, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q1 = _padLastQuad(q1, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -2328,10 +2343,12 @@ private final String findName(int q1, int q2, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q2 = _padLastQuad(q2, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -2343,9 +2360,11 @@ private final String findName(int q1, int q2, int q3, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q3 = _padLastQuad(q3, lastQuadBytes);
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
if (_symbolsCanonical) {
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
}
}
int[] quads = _quadBuffer;
quads[0] = q1;
Expand All @@ -2361,7 +2380,7 @@ private final String findName(int[] quads, int qlen, int lastQuad, int lastQuadB
_quadBuffer = quads = growArrayBy(quads, quads.length);
}
quads[qlen++] = _padLastQuad(lastQuad, lastQuadBytes);
String name = _symbols.findName(quads, qlen);
String name = _symbolsCanonical ? _symbols.findName(quads, qlen) : null;
if (name == null) {
return addName(quads, qlen, lastQuadBytes);
}
Expand Down Expand Up @@ -2475,6 +2494,9 @@ private final String addName(int[] quads, int qlen, int lastQuadBytes)

// Ok. Now we have the character array, and can construct the String
String baseName = new String(cbuf, 0, cix);
if (!_symbolsCanonical) {
return baseName;
}
// And finally, un-align if necessary
if (lastQuadBytes < 4) {
quads[qlen-1] = lastQuad;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,18 @@ public abstract class NonBlockingJsonParserBase
*/
final protected ByteQuadsCanonicalizer _symbols;

/**
* Marker flag to indicate that standard symbol handling is used
* (one with symbol table assisted canonicalization. May be disabled
* in which case alternate stream-line, non-canonicalizing handling
* is used: usually due to set of symbols
* (Object property names) is unbounded and will not benefit from
* canonicalization attempts.
*
* @since TODO(ckozak): determine target version, if accepted.
*/
final protected boolean _symbolsCanonical;

/**
* Temporary buffer used for name parsing.
*/
Expand Down Expand Up @@ -257,6 +269,7 @@ public NonBlockingJsonParserBase(IOContext ctxt, int parserFeatures,
{
super(ctxt, parserFeatures);
_symbols = sym;
_symbolsCanonical = sym.isCanonicalizing();
_currToken = null;
_majorState = MAJOR_INITIAL;
_majorStateAfterValue = MAJOR_ROOT;
Expand Down Expand Up @@ -644,10 +657,12 @@ protected final String _findName(int q1, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q1 = _padLastQuad(q1, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -658,10 +673,12 @@ protected final String _findName(int q1, int q2, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q2 = _padLastQuad(q2, lastQuadBytes);
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
if (_symbolsCanonical) {
// Usually we'll find it from the canonical symbol table already
String name = _symbols.findName(q1, q2);
if (name != null) {
return name;
}
}
// If not, more work. We'll need add stuff to buffer
_quadBuffer[0] = q1;
Expand All @@ -673,9 +690,11 @@ protected final String _findName(int q1, int q2, int q3, int lastQuadBytes)
throws JsonParseException, StreamConstraintsException
{
q3 = _padLastQuad(q3, lastQuadBytes);
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
if (_symbolsCanonical) {
String name = _symbols.findName(q1, q2, q3);
if (name != null) {
return name;
}
}
int[] quads = _quadBuffer;
quads[0] = q1;
Expand Down Expand Up @@ -790,6 +809,9 @@ protected final String _addName(int[] quads, int qlen, int lastQuadBytes)

// Ok. Now we have the character array, and can construct the String
String baseName = new String(cbuf, 0, cix);
if (!_symbolsCanonical) {
return baseName;
}
// And finally, un-align if necessary
if (lastQuadBytes < 4) {
quads[qlen-1] = lastQuad;
Expand Down
Loading

0 comments on commit 024c3ca

Please sign in to comment.