Skip to content

Commit 7a896cd

Browse files
committed
TIKA-3352 -- add a json option for the /tika endpoint
1 parent 9847d9b commit 7a896cd

File tree

14 files changed

+563
-11
lines changed

14 files changed

+563
-11
lines changed

CHANGES.txt

+2
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ Release 2.0.0-ALPHA - 01/13/2021
4444
This output is not available in tika-server-core.
4545

4646

47+
Release 1.27 - ??
4748

49+
* Add JSON output for the /tika endpoint in tika-server (TIKA-3352).
4850
* Tika's OpenNLPDetector now covers 148 languages and language-script pairs (TIKA-3340).
4951

5052
Release 1.26 - 03/24/2021

tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java

+39
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import static org.junit.Assert.assertTrue;
2424

2525
import java.io.InputStream;
26+
import java.io.InputStreamReader;
2627
import java.nio.charset.StandardCharsets;
2728
import java.util.ArrayList;
2829
import java.util.List;
@@ -42,6 +43,10 @@
4243
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
4344
import org.junit.Test;
4445

46+
import org.apache.tika.metadata.Metadata;
47+
import org.apache.tika.metadata.OfficeOpenXMLExtended;
48+
import org.apache.tika.metadata.TikaCoreProperties;
49+
import org.apache.tika.metadata.serialization.JsonMetadata;
4550
import org.apache.tika.parser.ocr.TesseractOCRParser;
4651
import org.apache.tika.server.classic.config.PDFServerConfig;
4752
import org.apache.tika.server.classic.config.TesseractServerConfig;
@@ -50,6 +55,7 @@
5055
import org.apache.tika.server.core.config.DocumentSelectorConfig;
5156
import org.apache.tika.server.core.config.PasswordProviderConfig;
5257
import org.apache.tika.server.core.resource.TikaResource;
58+
import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
5359

5460
public class TikaResourceTest extends CXFTestBase {
5561
public static final String TEST_DOC = "test-documents/test.doc";
@@ -74,6 +80,7 @@ protected void setUpResources(JAXRSServerFactoryBean sf) {
7480
protected void setUpProviders(JAXRSServerFactoryBean sf) {
7581
List<Object> providers = new ArrayList<>();
7682
providers.add(new TikaServerParseExceptionMapper(false));
83+
providers.add(new JSONMessageBodyWriter());
7784
sf.setProviders(providers);
7885
}
7986

@@ -562,4 +569,36 @@ private MultipartBody testPDFLowerCaseOCRConfigPOSTBody() {
562569
return new MultipartBody(att);
563570
}
564571

572+
@Test
573+
public void testJson() throws Exception {
574+
Response response = WebClient.create(endPoint + TIKA_PATH + "/text")
575+
.accept("application/json")
576+
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
577+
Metadata metadata =
578+
JsonMetadata.fromJson(new InputStreamReader(
579+
((InputStream)response.getEntity()), StandardCharsets.UTF_8));
580+
assertContains("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
581+
assertContains("General Congress", metadata.get(TikaCoreProperties.TIKA_CONTENT));
582+
assertNotFound("<p", metadata.get(TikaCoreProperties.TIKA_CONTENT));
583+
assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
584+
}
585+
586+
@Test
587+
public void testJsonWriteLimitEmbedded() throws Exception {
588+
Response response = WebClient.create(endPoint + TIKA_PATH + "/text")
589+
.accept("application/json")
590+
.header("writeLimit", "500")
591+
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
592+
Metadata metadata =
593+
JsonMetadata.fromJson(new InputStreamReader(
594+
((InputStream)response.getEntity()), StandardCharsets.UTF_8));
595+
assertContains("embed2a.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
596+
assertContains("When in the Course", metadata.get(TikaCoreProperties.TIKA_CONTENT));
597+
assertNotFound("declare the causes", metadata.get(TikaCoreProperties.TIKA_CONTENT));
598+
assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
599+
assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
600+
"org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"));
601+
assertNotFound("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
602+
603+
}
565604
}

tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ private static ServerDetails initServer(TikaServerConfig tikaServerConfig,
232232

233233
serverThread.start();
234234
}
235-
TikaResource.init(tika, digester, inputStreamFactory, serverStatus);
235+
TikaResource.init(tika, tikaServerConfig, digester, inputStreamFactory, serverStatus);
236236
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
237237

238238
List<ResourceProvider> resourceProviders = new ArrayList<>();

tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@
5151
@Path("/rmeta")
5252
public class RecursiveMetadataResource {
5353

54-
private static final String HANDLER_TYPE_PARAM = "handler";
55-
private static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
54+
protected static final String HANDLER_TYPE_PARAM = "handler";
55+
protected static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
5656
BasicContentHandlerFactory.HANDLER_TYPE.XML;
5757
private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class);
5858

tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java

+91-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
package org.apache.tika.server.core.resource;
1919

2020
import static java.nio.charset.StandardCharsets.UTF_8;
21+
import static org.apache.tika.server.core.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE;
22+
import static org.apache.tika.server.core.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM;
2123

2224
import java.io.IOException;
2325
import java.io.InputStream;
@@ -36,6 +38,7 @@
3638
import javax.ws.rs.POST;
3739
import javax.ws.rs.PUT;
3840
import javax.ws.rs.Path;
41+
import javax.ws.rs.PathParam;
3942
import javax.ws.rs.Produces;
4043
import javax.ws.rs.WebApplicationException;
4144
import javax.ws.rs.core.Context;
@@ -62,12 +65,14 @@
6265
import org.apache.tika.Tika;
6366
import org.apache.tika.config.TikaConfig;
6467
import org.apache.tika.exception.EncryptedDocumentException;
68+
import org.apache.tika.exception.TikaException;
6569
import org.apache.tika.metadata.Metadata;
6670
import org.apache.tika.metadata.TikaCoreProperties;
6771
import org.apache.tika.parser.AutoDetectParser;
6872
import org.apache.tika.parser.DigestingParser;
6973
import org.apache.tika.parser.ParseContext;
7074
import org.apache.tika.parser.Parser;
75+
import org.apache.tika.sax.BasicContentHandlerFactory;
7176
import org.apache.tika.sax.BodyContentHandler;
7277
import org.apache.tika.sax.ExpandedTitleContentHandler;
7378
import org.apache.tika.sax.RichTextContentHandler;
@@ -76,7 +81,9 @@
7681
import org.apache.tika.server.core.InputStreamFactory;
7782
import org.apache.tika.server.core.ParseContextConfig;
7883
import org.apache.tika.server.core.ServerStatus;
84+
import org.apache.tika.server.core.TikaServerConfig;
7985
import org.apache.tika.server.core.TikaServerParseException;
86+
import org.apache.tika.utils.ExceptionUtils;
8087

8188
@Path("/tika")
8289
public class TikaResource {
@@ -87,16 +94,19 @@ public class TikaResource {
8794
private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class);
8895
private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_+\\.A-Z0-9 ]+$");
8996
private static TikaConfig tikaConfig;
97+
private static TikaServerConfig tikaServerConfig;
9098
private static DigestingParser.Digester digester = null;
9199
private static InputStreamFactory inputStreamFactory = null;
92100
private static ServerStatus SERVER_STATUS = null;
93101

94102
private static ParseContextConfig PARSE_CONTEXT_CONFIG = new CompositeParseContextConfig();
95103

96104

97-
public static void init(TikaConfig config, DigestingParser.Digester digestr,
105+
public static void init(TikaConfig config, TikaServerConfig tikaServerConfg,
106+
DigestingParser.Digester digestr,
98107
InputStreamFactory iSF, ServerStatus serverStatus) {
99108
tikaConfig = config;
109+
tikaServerConfig = tikaServerConfg;
100110
digester = digestr;
101111
inputStreamFactory = iSF;
102112
SERVER_STATUS = serverStatus;
@@ -508,6 +518,86 @@ public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHea
508518
httpHeaders.getRequestHeaders(), info, "xml");
509519
}
510520

521+
@POST
522+
@Consumes("multipart/form-data")
523+
@Produces("application/json")
524+
@Path("form{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
525+
public Metadata getJsonFromMultipart(Attachment att,
526+
@Context HttpHeaders httpHeaders,
527+
@Context final UriInfo info,
528+
@PathParam(HANDLER_TYPE_PARAM)
529+
String handlerTypeName)
530+
throws IOException, TikaException {
531+
Metadata metadata = new Metadata();
532+
parseToMetadata(getInputStream(att.getObject(InputStream.class), metadata, httpHeaders),
533+
metadata, preparePostHeaderMap(att, httpHeaders), info, handlerTypeName);
534+
TikaResource.getConfig().getMetadataFilter().filter(metadata);
535+
return metadata;
536+
}
537+
538+
@PUT
539+
@Consumes("*/*")
540+
@Produces("application/json")
541+
@Path("{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
542+
public Metadata getJson(final InputStream is, @Context
543+
HttpHeaders httpHeaders,
544+
@Context final UriInfo info, @PathParam(HANDLER_TYPE_PARAM)
545+
String handlerTypeName)
546+
throws IOException, TikaException {
547+
Metadata metadata = new Metadata();
548+
parseToMetadata(getInputStream(is, metadata, httpHeaders), metadata,
549+
httpHeaders.getRequestHeaders(), info, handlerTypeName);
550+
TikaResource.getConfig().getMetadataFilter().filter(metadata);
551+
return metadata;
552+
}
553+
554+
private void parseToMetadata(InputStream inputStream,
555+
Metadata metadata,
556+
MultivaluedMap<String, String> httpHeaders,
557+
UriInfo info, String handlerTypeName) throws IOException {
558+
final Parser parser = createParser();
559+
final ParseContext context = new ParseContext();
560+
561+
fillMetadata(parser, metadata, httpHeaders);
562+
fillParseContext(httpHeaders, metadata, context);
563+
564+
logRequest(LOG, "/tika", metadata);
565+
int writeLimit = -1;
566+
if (httpHeaders.containsKey("writeLimit")) {
567+
writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
568+
}
569+
BasicContentHandlerFactory.HANDLER_TYPE type =
570+
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
571+
BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit);
572+
ContentHandler contentHandler = fact.getNewContentHandler();
573+
574+
try {
575+
parse(parser, LOG, info.getPath(), inputStream, contentHandler, metadata, context);
576+
} catch (TikaServerParseException e) {
577+
if (tikaServerConfig.isReturnStackTrace()) {
578+
Throwable cause = e.getCause();
579+
if (cause != null) {
580+
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
581+
ExceptionUtils.getStackTrace(cause));
582+
} else {
583+
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
584+
ExceptionUtils.getStackTrace(e));
585+
}
586+
} else {
587+
throw e;
588+
}
589+
} catch (OutOfMemoryError e) {
590+
if (tikaServerConfig.isReturnStackTrace()) {
591+
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
592+
ExceptionUtils.getStackTrace(e));
593+
} else {
594+
throw e;
595+
}
596+
} finally {
597+
metadata.add(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
598+
}
599+
}
600+
511601
private StreamingOutput produceOutput(final InputStream is, Metadata metadata,
512602
final MultivaluedMap<String, String> httpHeaders,
513603
final UriInfo info, final String format) {

tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java

+12-3
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@
5757

5858
public abstract class CXFTestBase {
5959
protected static final String endPoint = "http://localhost:" + TikaServerConfig.DEFAULT_PORT;
60-
private final static int DIGESTER_READ_LIMIT = 20 * 1024 * 1024;
60+
protected final static int DIGESTER_READ_LIMIT = 20 * 1024 * 1024;
6161
protected Server server;
62-
private TikaConfig tika;
62+
protected TikaConfig tika;
6363

6464
public static void assertContains(String needle, String haystack) {
6565
assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
@@ -101,7 +101,10 @@ public static InputStream gzip(InputStream is) throws IOException {
101101
public void setUp() throws Exception {
102102

103103
this.tika = new TikaConfig(getTikaConfigInputStream());
104-
TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
104+
TikaServerConfig tikaServerConfig = getTikaServerConfig();
105+
TikaResource.init(tika, tikaServerConfig,
106+
new CommonsDigester(DIGESTER_READ_LIMIT, "md5," +
107+
"sha1:32"),
105108
getInputStreamFactory(tika), new ServerStatus("", 0, true));
106109
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
107110
//set compression interceptors
@@ -121,6 +124,12 @@ public void setUp() throws Exception {
121124
server = sf.create();
122125
}
123126

127+
protected TikaServerConfig getTikaServerConfig() {
128+
TikaServerConfig tikaServerConfig = new TikaServerConfig();
129+
tikaServerConfig.setReturnStackTrace(true);
130+
return tikaServerConfig;
131+
}
132+
124133
protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
125134
return new DefaultInputStreamFactory();
126135
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.tika.server.core;
19+
20+
import static java.nio.charset.StandardCharsets.UTF_8;
21+
import static org.junit.Assert.assertEquals;
22+
23+
import java.io.InputStream;
24+
import java.io.InputStreamReader;
25+
import java.io.Reader;
26+
import java.util.ArrayList;
27+
import java.util.List;
28+
import javax.ws.rs.core.Response;
29+
30+
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
31+
import org.apache.cxf.jaxrs.client.WebClient;
32+
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
33+
import org.junit.Test;
34+
35+
import org.apache.tika.metadata.Metadata;
36+
import org.apache.tika.metadata.TikaCoreProperties;
37+
import org.apache.tika.metadata.serialization.JsonMetadataList;
38+
import org.apache.tika.server.core.resource.RecursiveMetadataResource;
39+
import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
40+
41+
public class RecursiveMetadataResourceTest extends CXFTestBase {
42+
43+
private static final String META_PATH = "/rmeta";
44+
45+
public static final String TEST_NULL_POINTER = "test-documents/mock/null_pointer.xml";
46+
47+
@Override
48+
protected void setUpResources(JAXRSServerFactoryBean sf) {
49+
sf.setResourceClasses(RecursiveMetadataResource.class);
50+
sf.setResourceProvider(RecursiveMetadataResource.class,
51+
new SingletonResourceProvider(new RecursiveMetadataResource()));
52+
}
53+
54+
@Override
55+
protected void setUpProviders(JAXRSServerFactoryBean sf) {
56+
List<Object> providers = new ArrayList<>();
57+
providers.add(new MetadataListMessageBodyWriter());
58+
sf.setProviders(providers);
59+
}
60+
61+
@Test
62+
public void testNPE() throws Exception {
63+
Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
64+
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
65+
66+
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
67+
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
68+
Metadata metadata = metadataList.get(0);
69+
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
70+
assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
71+
assertContains("some content", metadata.get(TikaCoreProperties.TIKA_CONTENT));
72+
assertContains("null pointer message",
73+
metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
74+
75+
}
76+
77+
}

tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java

+10-2
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,11 @@ public void testEncrypted() throws Exception {
8787
if ("/rmeta".equals(path)) {
8888
continue;
8989
}
90-
Response response = WebClient.create(endPoint + path).accept("*/*")
90+
String accept = "*/*";
91+
if ("/tika".equals(path)) {
92+
accept = "text/plain";
93+
}
94+
Response response = WebClient.create(endPoint + path).accept(accept)
9195
.header("Content-Disposition",
9296
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
9397
.put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
@@ -104,7 +108,11 @@ public void testNullPointerOnTika() throws Exception {
104108
if ("/rmeta".equals(path)) {
105109
continue;
106110
}
107-
Response response = WebClient.create(endPoint + path).accept("*/*")
111+
String accept = "*/*";
112+
if ("/tika".equals(path)) {
113+
accept = "text/plain";
114+
}
115+
Response response = WebClient.create(endPoint + path).accept(accept)
108116
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
109117
assertNotNull("null response: " + path, response);
110118
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());

0 commit comments

Comments
 (0)