
Commit 2da93f4

Make convertToAbsoluteInputBoxes depend on output tensor dimensions
In docTR/onnxTR models the width and height of the input image match the "width" and "height" of the output tensor, so using the pre-defined static dimensions was fine. But for some of the other models this doesn't work.

One example is EasyOCR, where the output of the detection model is only a quarter of the input resolution. In that particular case the downscaling doesn't really matter for the results, since the dimensions are reduced uniformly in both width and height, but the calculations were still using the wrong values. What does matter is that the input width and height are not static, so we cannot get the actual width and height from the declarative input properties, and there the mismatch is easily noticeable.

Another example is PaddleOCR. While the width and height of the output match those of the input, the input size is not static there either, which caused issues with boxes during testing.

Since our assumptions that width and height are static and that the channel count is constant are broken, I've relaxed the output shape validation. At this point the only place where this validation does anything is the class count in the orientation and recognition predictors. Kind of disappointing, but at least the remaining check can still help catch errors.

Two test files were updated, as bumping float to double adjusted the coordinates slightly.
1 parent 7244cf9 commit 2da93f4
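The fix boils down to reading the spatial dimensions from the output tensor at runtime instead of trusting the statically declared input shape. Here is a minimal sketch of that idea against the ONNX Runtime Java API; the class and helper names are illustrative, not code from this commit:

```java
import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.TensorInfo;

final class OutputShapeProbe {
    // Hypothetical helper: derive height/width from a BCHW detection output
    // at runtime, so downscaled (EasyOCR) or dynamic (PaddleOCR) outputs work.
    static int[] outputHeightWidth(OnnxTensor detectionOutput) {
        TensorInfo info = (TensorInfo) detectionOutput.getInfo();
        long[] shape = info.getShape(); // concrete {batch, channels, height, width}
        if (shape.length != 4) {
            throw new IllegalArgumentException("expected a BCHW tensor, got rank " + shape.length);
        }
        return new int[]{(int) shape[2], (int) shape[3]};
    }
}
```

In the commit itself the same information comes from the project's own FloatBufferMdArray via outputBatch.getDimension(2) and outputBatch.getDimension(3), as shown in the diff below.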

File tree

4 files changed: +48 -28 lines

pdfocr-onnxtr/src/main/java/com/itextpdf/pdfocr/onnxtr/detection/OnnxDetectionPredictor.java

Lines changed: 48 additions & 28 deletions
@@ -25,7 +25,6 @@ This file is part of the iText (R) project.
 import com.itextpdf.kernel.geom.Point;
 import com.itextpdf.pdfocr.onnxtr.AbstractOnnxPredictor;
 import com.itextpdf.pdfocr.onnxtr.FloatBufferMdArray;
-import com.itextpdf.pdfocr.onnxtr.OnnxInputProperties;
 import com.itextpdf.pdfocr.onnxtr.util.BufferedImageUtil;
 import com.itextpdf.pdfocr.onnxtr.util.MathUtil;

@@ -40,6 +39,33 @@ This file is part of the iText (R) project.
  */
 public class OnnxDetectionPredictor extends AbstractOnnxPredictor<BufferedImage, List<Point[]>>
         implements IDetectionPredictor {
+    /**
+     * The expected output shape (BCHW).
+     *
+     * <p>
+     * Batch size is dynamic, as usual, so -1 there.
+     *
+     * <p>
+     * For channels, ideally, there is just one "monochrome" image, but some
+     * models put multiple different metrics in one output (ex. EasyOCR
+     * returns 2), so we will assume dynamic size here as well.
+     *
+     * <p>
+     * As for height and width, while in OnnxTR the dimensions are static and
+     * are equal to the input image dimensions, this is not the case
+     * everywhere. For example, in EasyOCR output is a quarter of the input
+     * resolution, but still static. On the other hand, in PaddleOCR, input
+     * and output resolutions are the same, but they are dynamic. So we cannot
+     * statically check this here without knowing the exact dimensions of the
+     * input.
+     *
+     * <p>
+     * Overall, this means that the dimension checks for the output of the
+     * models are useless here, except for checking that there are 4
+     * dimensions...
+     */
+    private static final long[] EXPECTED_OUTPUT_SHAPE = new long[]{-1, -1, -1, -1};
+
     /**
      * Configuration properties of the predictor.
      */
@@ -51,7 +77,7 @@ public class OnnxDetectionPredictor extends AbstractOnnxPredictor<BufferedImage,
      * @param properties properties of the predictor
      */
     public OnnxDetectionPredictor(OnnxDetectionPredictorProperties properties) {
-        super(properties.getModelPath(), properties.getInputProperties(), getExpectedOutputShape(properties));
+        super(properties.getModelPath(), properties.getInputProperties(), EXPECTED_OUTPUT_SHAPE);
         this.properties = properties;
     }

@@ -187,6 +213,9 @@ protected FloatBufferMdArray toInputBuffer(List<BufferedImage> batch) {
      */
     @Override
    protected List<List<Point[]>> fromOutputBuffer(List<BufferedImage> inputBatch, FloatBufferMdArray outputBatch) {
+        final int batchWidth = outputBatch.getDimension(3);
+        final int batchHeight = outputBatch.getDimension(2);
+        final boolean usedSymmetricPadding = properties.getInputProperties().useSymmetricPad();
         final IDetectionPostProcessor postProcessor = properties.getPostProcessor();
         final List<List<Point[]>> batchTextBoxes = new ArrayList<>(inputBatch.size());
         for (int i = 0; i < inputBatch.size(); ++i) {
@@ -198,32 +227,35 @@ protected List<List<Point[]>> fromOutputBuffer(List<BufferedImage> inputBatch, F
              * absolute coordinates in the input image. This means that we need
              * to revert resizing/padding changes as well.
              */
-            convertToAbsoluteInputBoxes(image, textBoxes, properties.getInputProperties());
+            convertToAbsoluteInputBoxes(image, textBoxes, batchWidth, batchHeight, usedSymmetricPadding);
             batchTextBoxes.add(textBoxes);
         }
         return batchTextBoxes;
     }
 
-    private static void convertToAbsoluteInputBoxes(BufferedImage image, List<Point[]> boxes,
-            OnnxInputProperties properties) {
-        int sourceWidth = image.getWidth();
-        int sourceHeight = image.getHeight();
-        float targetWidth = properties.getWidth();
-        float targetHeight = properties.getHeight();
-        float widthRatio = targetWidth / sourceWidth;
-        float heightRatio = targetHeight / sourceHeight;
-        float widthScale;
-        float heightScale;
+    private static void convertToAbsoluteInputBoxes(
+            BufferedImage image,
+            List<Point[]> boxes,
+            int batchWidth,
+            int batchHeight,
+            boolean usedSymmetricPadding
+    ) {
+        final int sourceWidth = image.getWidth();
+        final int sourceHeight = image.getHeight();
+        final double widthRatio = (double) batchWidth / sourceWidth;
+        final double heightRatio = (double) batchHeight / sourceHeight;
+        final double widthScale;
+        final double heightScale;
         // We preserve ratio when resizing input
         if (heightRatio > widthRatio) {
-            heightScale = targetHeight / (float) Math.round(sourceHeight * widthRatio);
+            heightScale = batchHeight / (double) Math.round(sourceHeight * widthRatio);
             widthScale = 1;
         } else {
-            widthScale = targetWidth / (float) Math.round(sourceWidth * heightRatio);
+            widthScale = batchWidth / (double) Math.round(sourceWidth * heightRatio);
             heightScale = 1;
         }
         final Consumer<Point> updater;
-        if (properties.useSymmetricPad()) {
+        if (usedSymmetricPadding) {
             updater = p -> p.setLocation(
                     MathUtil.clamp(sourceWidth * (0.5 + (p.getX() - 0.5) * widthScale), 0, sourceWidth),
                     MathUtil.clamp(sourceHeight * (0.5 + (p.getY() - 0.5) * heightScale), 0, sourceHeight)
@@ -240,16 +272,4 @@ private static void convertToAbsoluteInputBoxes(BufferedImage image, List<Point[
             }
         }
     }
-
-    private static long[] getExpectedOutputShape(OnnxDetectionPredictorProperties properties) {
-        final OnnxInputProperties inputProperties = properties.getInputProperties();
-        // Dynamic batch size
-        final long BATCH_SIZE = -1;
-        // Output is "monochrome"
-        final long CHANNEL_COUNT = 1;
-        // Output retains the "image" dimension from the input
-        final long height = inputProperties.getHeight();
-        final long width = inputProperties.getWidth();
-        return new long[]{BATCH_SIZE, CHANNEL_COUNT, height, width};
-    }
 }
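For intuition, here is a self-contained worked example of the reverse mapping in convertToAbsoluteInputBoxes, with made-up dimensions: a 1280x720 image letterboxed into a 256x256 model input with symmetric padding. The numbers are illustrative, not taken from the updated tests:

```java
public final class BoxScaleDemo {
    public static void main(String[] args) {
        final int sourceWidth = 1280, sourceHeight = 720;   // original image
        final int batchWidth = 256, batchHeight = 256;      // read from the output tensor

        final double widthRatio = (double) batchWidth / sourceWidth;    // 0.2
        final double heightRatio = (double) batchHeight / sourceHeight; // ~0.356

        // heightRatio > widthRatio, so width is the limiting side and only the
        // height was padded: the image occupies round(720 * 0.2) = 144 of the
        // 256 output rows.
        final double heightScale = batchHeight / (double) Math.round(sourceHeight * widthRatio); // ~1.778
        final double widthScale = 1;

        // A relative point from the detector mapped back to image pixels
        // (the symmetric-padding branch of convertToAbsoluteInputBoxes).
        final double relX = 0.25, relY = 0.4;
        final double absX = sourceWidth * (0.5 + (relX - 0.5) * widthScale);   // 320.0
        final double absY = sourceHeight * (0.5 + (relY - 0.5) * heightScale); // 232.0
        System.out.printf("absolute point: (%.1f, %.1f)%n", absX, absY);
    }
}
```

With the old code, heightScale would have been computed from the static input properties; when the actual output is downscaled (EasyOCR) or the input is dynamic (PaddleOCR), only the tensor's own dimensions give the right scales.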
