Skip to content

Commit 6a098b7

Browse files
authored Dec 4, 2024
TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
* TIKA-4357 -- improve metadata key prefixing for PDFs and html
* TIKA-4357 -- fix unit test
1 parent 3806e55 commit 6a098b7

File tree

6 files changed

+21
-25
lines changed

6 files changed

+21
-25
lines changed
 

‎tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

+1-1
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,7 @@ public void testMetadataOutput() throws Exception {
219219
public void testJsonMetadataOutput() throws Exception {
220220
String json = getParamOutContent("--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html");
221221
//TIKA-1310
222-
assertTrue(json.contains("\"" + "fb:admins\":\"1,2,3,4\","));
222+
assertTrue(json.contains("\"html_meta:fb:admins\":\"1,2,3,4\","));
223223
assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
224224
}
225225

‎tika-core/src/main/java/org/apache/tika/metadata/HTML.java

+2-2
Original file line number | Diff line number | Diff line change
@@ -16,14 +16,14 @@
1616
*/
1717

1818
public interface HTML {
19-
String PREFIX_HTML_META = "html_meta";
19+
String PREFIX_HTML_META = "html_meta" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
2020

2121

2222
/**
2323
* If a script element contains a src value, this value
2424
* is set in the embedded document's metadata
2525
*/
2626
Property SCRIPT_SOURCE = Property.internalText(
27-
PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
27+
PREFIX_HTML_META + "scriptSrc");
2828

2929
}

‎tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java

+4-3
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ public void startElement(String uri, String local, String name, Attributes atts)
143143
addHtmlMetadata(atts.getValue("name"), atts.getValue("content"));
144144
} else if (atts.getValue("property") != null) {
145145
// TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
146-
metadata.add(atts.getValue("property"), atts.getValue("content"));
146+
metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content"));
147147
}
148148
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
149149
startElementWithSafeAttributes("base", atts);
@@ -222,14 +222,15 @@ private void addHtmlMetadata(String name, String value) {
222222
if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
223223
//prefer the title element if it is already set
224224
//do nothing
225+
metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value);
225226
} else if (property.isMultiValuePermitted()) {
226227
metadata.add(property, value);
227228
} else {
228229
metadata.set(property, value);
229230
}
231+
} else {
232+
metadata.add(HTML.PREFIX_HTML_META + name, value);
230233
}
231-
//TODO -- we should prefix these raw names to avoid collisions
232-
metadata.add(name, value);
233234
}
234235

235236
private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {

‎tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

+8-11
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@
7070
import org.apache.tika.exception.TikaException;
7171
import org.apache.tika.io.TikaInputStream;
7272
import org.apache.tika.metadata.Geographic;
73+
import org.apache.tika.metadata.HTML;
7374
import org.apache.tika.metadata.Metadata;
7475
import org.apache.tika.metadata.Office;
7576
import org.apache.tika.metadata.TikaCoreProperties;
@@ -109,8 +110,8 @@ public void startElement(String u, String l, String n, Attributes a)
109110
}
110111

111112
assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
112-
assertEquals("Tika Developers", metadata.get("Author"));
113-
assertEquals("5", metadata.get("refresh"));
113+
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
114+
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
114115

115116
assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
116117
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
@@ -152,8 +153,8 @@ public void testXhtmlParsing() throws Exception {
152153
metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
153154
assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
154155

155-
assertEquals("Tika Developers", metadata.get("Author"));
156-
assertEquals("5", metadata.get("refresh"));
156+
assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
157+
assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
157158
assertContains("ability of Apache Tika", content);
158159
assertContains("extract content", content);
159160
assertContains("an XHTML document", content);
@@ -809,8 +810,8 @@ public void testOpenGraphMetadata() throws Exception {
809810
Metadata metadata = new Metadata();
810811
new JSoupParser().parse(new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
811812
new BodyContentHandler(), metadata, new ParseContext());
812-
assertEquals("some description", metadata.get("og:description"));
813-
assertTrue(metadata.isMultiValued("og:image"));
813+
assertEquals("some description", metadata.get(HTML.PREFIX_HTML_META + "og:description"));
814+
assertTrue(metadata.isMultiValued(HTML.PREFIX_HTML_META + "og:image"));
814815
}
815816

816817
// TIKA-1011
@@ -1220,19 +1221,15 @@ public void testMetadataMapping() throws Exception {
12201221
List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
12211222
Metadata m = metadataList.get(0);
12221223
assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
1223-
assertEquals("Free Web tutorials", m.get("description"));
12241224

12251225
assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
1226-
assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
12271226

12281227
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
12291228
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
12301229

12311230
assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
1232-
assertEquals("OldMetaTitle", m.get("title"));
12331231

12341232
assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
1235-
assertEquals("John Doe", m.get("author"));
12361233
}
12371234

12381235
@Test
@@ -1242,7 +1239,7 @@ public void testPreferenceForTitleElement() throws Exception {
12421239
Metadata m = metadataList.get(0);
12431240

12441241
assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
1245-
assertEquals("OldMetaTitle", m.get("title"));
1242+
assertEquals("OldMetaTitle", m.get(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName()));
12461243
}
12471244

12481245
@Test

‎tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

-2
Original file line number | Diff line number | Diff line change
@@ -630,8 +630,6 @@ private void extractMetadata(PDDocument document, Metadata metadata, ParseContex
630630
for (COSName key : info.getCOSObject().keySet()) {
631631
String name = key.getName();
632632
if (!handledMetadata.contains(name)) {
633-
PDMetadataExtractor
634-
.addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
635633
PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
636634
info.getCOSObject().getDictionaryObject(key));
637635
}

‎tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

+6-6
Original file line number | Diff line number | Diff line change
@@ -150,20 +150,20 @@ public void testPdfParsingMetadataOnly() throws Exception {
150150
}
151151

152152
@Test
153-
public void testCustomMetadata() throws Exception {
153+
public void testCustomMetadataInPDDocInfo() throws Exception {
154154

155155
XMLResult r = getXML("testPDF-custommetadata.pdf");
156156
Metadata metadata = r.metadata;
157157
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
158158
assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
159159
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
160160

161-
assertEquals("Custom Value", metadata.get("Custom Property"));
161+
assertEquals("Custom Value", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Property"));
162162

163-
assertEquals("Array Entry 1", metadata.get("Custom Array"));
164-
assertEquals(2, metadata.getValues("Custom Array").length);
165-
assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
166-
assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
163+
assertEquals("Array Entry 1", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array"));
164+
assertEquals(2, metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array").length);
165+
assertEquals("Array Entry 1", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[0]);
166+
assertEquals("Array Entry 2", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[1]);
167167

168168
assertContains("Hello World!", r.xml);
169169
}

0 commit comments

Comments
 (0)