Skip to content

Commit 3806e55

Browse files
authored
TIKA-4360 -- improve extraction of mapi metadata (#2073)
1 parent 1e51784 commit 3806e55

File tree

7 files changed

+252
-106
lines changed

7 files changed

+252
-106
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.metadata;
18+
19+
/**
20+
* MAPI metadata properties collection. These properties apply to
21+
* Microsoft Outlook .msg/MAPI message files and cover details such as the
22+
* message class, submission, conversation and internet message id values.
23+
* This is a logical collection of properties, which may be drawn from a
24+
* few different external definitions.
25+
*
26+
* @since Apache Tika 1.2
27+
*/
28+
public interface MAPI {
29+
30+
// Prefix shared by every key declared here: "mapi" followed by the Tika namespace delimiter.
String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
31+
32+
/**
33+
* MAPI message class. What type of .msg/MAPI file is it?
34+
*/
35+
Property MESSAGE_CLASS =
36+
Property.internalClosedChoise(PREFIX_MAPI_META + "message-class", "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE",
37+
"POST", "TASK", "UNKNOWN", "UNSPECIFIED");
38+
39+
// Text property stored under the key "mapi:sent-by-server-type".
Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type");
40+
41+
// Name and email of the "from representing" party, kept separate from the sender fields.
Property FROM_REPRESENTING_NAME = Property.internalText(PREFIX_MAPI_META + "from-representing-name");
42+
43+
Property FROM_REPRESENTING_EMAIL = Property.internalText(PREFIX_MAPI_META + "from-representing-email");
44+
45+
// Date and id taken from the message submission chunk.
Property SUBMISSION_ACCEPTED_AT_TIME = Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time");
46+
47+
Property SUBMISSION_ID = Property.internalText(PREFIX_MAPI_META + "msg-submission-id");
48+
49+
Property INTERNET_MESSAGE_ID = Property.internalText(PREFIX_MAPI_META + "internet-message-id");
50+
51+
// Multi-valued (text bag): one entry per referenced message.
Property INTERNET_REFERENCES = Property.internalTextBag(PREFIX_MAPI_META + "internet-references");
52+
53+
54+
Property CONVERSATION_TOPIC = Property.internalText(PREFIX_MAPI_META + "conversation-topic");
55+
56+
// Stored as text; NOTE(review): the extractor appears to write this as a hex string -- confirm.
Property CONVERSATION_INDEX = Property.internalText(PREFIX_MAPI_META + "conversation-index");
57+
Property IN_REPLY_TO_ID = Property.internalText(PREFIX_MAPI_META + "in-reply-to-id");
58+
59+
Property RECIPIENTS_STRING = Property.internalText(PREFIX_MAPI_META + "recipients-string");
60+
Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META + "importance");
61+
// NOTE(review): the constant name is misspelled ("PRIORTY"); the metadata key itself is "priority".
Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority");
62+
Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged");
63+
}

tika-core/src/main/java/org/apache/tika/metadata/Office.java

-32
Original file line numberDiff line numberDiff line change
@@ -145,43 +145,11 @@ public interface Office {
145145
Property OBJECT_COUNT = Property.internalInteger(
146146
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count");
147147

148-
/**
149-
* MAPI message class. What type of .msg/MAPI file is it?
150-
*/
151-
Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
152-
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-message-class",
153-
"APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", "UNKNOWN",
154-
"UNSPECIFIED");
155-
156-
Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText(
157-
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
158-
"mapi-sent-by-server-type");
159-
160-
Property MAPI_FROM_REPRESENTING_NAME = Property.internalText(
161-
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
162-
"mapi-from-representing-name");
163-
164-
Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
165-
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
166-
"mapi-from-representing-email");
167-
168-
Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate(
169-
PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
170-
"mapi-msg-client-submit-time");
171-
172148
/**
173149
* Embedded files may have a "progID" associated with them, such as
174150
* Word.Document.12 or AcroExch.Document.DC
175151
*/
176152
Property PROG_ID = Property.internalText("msoffice:progID");
177153

178154
Property OCX_NAME = Property.internalText("msoffice:ocxName");
179-
Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META +
180-
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string");
181-
Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META +
182-
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
183-
Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META +
184-
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
185-
Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
186-
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
187155
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

+145-45
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@
2424
import java.nio.charset.IllegalCharsetNameException;
2525
import java.nio.charset.UnsupportedCharsetException;
2626
import java.util.ArrayList;
27+
import java.util.Calendar;
2728
import java.util.Collections;
2829
import java.util.Date;
30+
import java.util.HashMap;
2931
import java.util.LinkedHashMap;
3032
import java.util.LinkedList;
3133
import java.util.List;
@@ -34,6 +36,7 @@
3436
import java.util.regex.Matcher;
3537
import java.util.regex.Pattern;
3638

39+
import org.apache.commons.codec.binary.Hex;
3740
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
3841
import org.apache.james.mime4j.codec.DecodeMonitor;
3942
import org.apache.james.mime4j.codec.DecoderUtil;
@@ -44,6 +47,7 @@
4447
import org.apache.poi.hsmf.datatypes.Chunk;
4548
import org.apache.poi.hsmf.datatypes.Chunks;
4649
import org.apache.poi.hsmf.datatypes.MAPIProperty;
50+
import org.apache.poi.hsmf.datatypes.MessageSubmissionChunk;
4751
import org.apache.poi.hsmf.datatypes.PropertyValue;
4852
import org.apache.poi.hsmf.datatypes.RecipientChunks;
4953
import org.apache.poi.hsmf.datatypes.StringChunk;
@@ -56,9 +60,9 @@
5660
import org.apache.tika.exception.TikaException;
5761
import org.apache.tika.extractor.EmbeddedDocumentUtil;
5862
import org.apache.tika.io.TikaInputStream;
63+
import org.apache.tika.metadata.MAPI;
5964
import org.apache.tika.metadata.Message;
6065
import org.apache.tika.metadata.Metadata;
61-
import org.apache.tika.metadata.Office;
6266
import org.apache.tika.metadata.Property;
6367
import org.apache.tika.metadata.TikaCoreProperties;
6468
import org.apache.tika.mime.MediaType;
@@ -80,17 +84,48 @@
8084
public class OutlookExtractor extends AbstractPOIFSExtractor {
8185

8286
private static final Metadata EMPTY_METADATA = new Metadata();
83-
84-
private static Pattern HEADER_KEY_PAT =
85-
Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
86-
87-
private final MAPIMessage msg;
87+
// MAPI time properties that are copied verbatim into the metadata, each under its own key.
private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new MAPIProperty[] {
88+
MAPIProperty.CLIENT_SUBMIT_TIME,
89+
MAPIProperty.CREATION_TIME,
90+
MAPIProperty.DEFERRED_DELIVERY_TIME,
91+
MAPIProperty.DELIVER_TIME,
92+
//TODO: should the EXPAND BEGIN and EXPAND END time properties be included here as well?
93+
MAPIProperty.EXPIRY_TIME,
94+
MAPIProperty.LAST_MODIFICATION_TIME,
95+
MAPIProperty.LATEST_DELIVERY_TIME,
96+
MAPIProperty.MESSAGE_DELIVERY_TIME,
97+
MAPIProperty.MESSAGE_DOWNLOAD_TIME,
98+
MAPIProperty.ORIGINAL_DELIVERY_TIME,
99+
MAPIProperty.ORIGINAL_SUBMIT_TIME,
100+
MAPIProperty.PROVIDER_SUBMIT_TIME,
101+
MAPIProperty.RECEIPT_TIME,
102+
MAPIProperty.REPLY_TIME,
103+
MAPIProperty.REPORT_TIME
104+
105+
};
106+
107+
// Maps each MAPI time property above to the Tika date Property it is written to.
private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES = new HashMap<>();
108+
109+
// Builds the Tika key for every listed MAPI property from the property's own name.
static {
110+
for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) {
111+
String name = property.mapiProperty.toLowerCase(Locale.ROOT);
112+
// drops the first three characters -- presumably a "PR_" style prefix; TODO confirm
name = name.substring(3);
113+
name = name.replace('_', '-');
114+
// final key is MAPI.PREFIX_MAPI_META + the lower-cased, hyphenated name
name = MAPI.PREFIX_MAPI_META + name;
115+
Property tikaProp = Property.internalDate(name);
116+
LITERAL_TIME_PROPERTIES.put(property, tikaProp);
117+
}
118+
}
88119
//this according to the spec; in practice, it is probably more likely
89120
//that a "split field" fails to start with a space character than
90121
//that a real header contains anything but [-_A-Za-z0-9].
91122
//e.g.
92123
//header: this header goes onto the next line
93124
//<mailto:[email protected]...
125+
private static Pattern HEADER_KEY_PAT =
126+
Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");
127+
128+
private final MAPIMessage msg;
94129
private final ParseContext parseContext;
95130
private final boolean extractAllAlternatives;
96131
HtmlEncodingDetector detector = new HtmlEncodingDetector();
@@ -158,7 +193,7 @@ public void parse(XHTMLContentHandler xhtml)
158193
msg.setReturnNullOnMissingChunk(true);
159194

160195
try {
161-
parentMetadata.set(Office.MAPI_MESSAGE_CLASS, msg.getMessageClassEnum().name());
196+
parentMetadata.set(MAPI.MESSAGE_CLASS, msg.getMessageClassEnum().name());
162197
} catch (ChunkNotFoundException e) {
163198
//swallow
164199
}
@@ -170,15 +205,10 @@ public void parse(XHTMLContentHandler xhtml)
170205
}
171206

172207
// Start with the metadata
173-
String subject = msg.getSubject();
174208
Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
175-
String from = msg.getDisplayFrom();
176209

177210
handleFromTo(headers, parentMetadata);
178-
179-
parentMetadata.set(TikaCoreProperties.TITLE, subject);
180-
parentMetadata.set(TikaCoreProperties.SUBJECT, msg.getConversationTopic());
181-
parentMetadata.set(TikaCoreProperties.DESCRIPTION, msg.getConversationTopic());
211+
handleMessageInfo(msg, headers, parentMetadata);
182212

183213
try {
184214
for (String recipientAddress : msg.getRecipientEmailAddressList()) {
@@ -197,35 +227,7 @@ public void parse(XHTMLContentHandler xhtml)
197227
}
198228
}
199229

200-
// Date - try two ways to find it
201-
// First try via the proper chunk
202-
if (msg.getMessageDate() != null) {
203-
parentMetadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
204-
parentMetadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
205-
} else {
206-
if (headers != null && headers.size() > 0) {
207-
for (Map.Entry<String, String[]> header : headers.entrySet()) {
208-
String headerKey = header.getKey();
209-
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
210-
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
211-
212-
// See if we can parse it as a normal mail date
213-
try {
214-
Date d = MailDateParser.parseDateLenient(date);
215-
parentMetadata.set(TikaCoreProperties.CREATED, d);
216-
parentMetadata.set(TikaCoreProperties.MODIFIED, d);
217-
} catch (SecurityException e ) {
218-
throw e;
219-
} catch (Exception e) {
220-
// Store it as-is, and hope for the best...
221-
parentMetadata.set(TikaCoreProperties.CREATED, date);
222-
parentMetadata.set(TikaCoreProperties.MODIFIED, date);
223-
}
224-
break;
225-
}
226-
}
227-
}
228-
}
230+
handleGeneralDates(msg, headers, parentMetadata);
229231

230232
// Get the message body. Preference order is: html, rtf, text
231233
Chunk htmlChunk = null;
@@ -277,6 +279,104 @@ public void parse(XHTMLContentHandler xhtml)
277279
}
278280
}
279281

282+
/**
* Copies subject/topic information and message-level MAPI values from the
* message into the metadata: title, subject, description, conversation topic,
* and -- when the main chunks are available -- the internet message id,
* conversation index (hex encoded), internet references, in-reply-to id, the
* time properties listed in LITERAL_TIME_PROPERTIES and the submission chunk
* id and accepted-at time.
*
* @param msg the message to read from
* @param headers normalized headers; not referenced in this method body
* @param metadata the metadata object to populate
* @throws ChunkNotFoundException declared for the message accessors called here
*/
private void handleMessageInfo(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata)
283+
throws ChunkNotFoundException {
284+
//this is the literal subject including "re: "
285+
metadata.set(TikaCoreProperties.TITLE, msg.getSubject());
286+
//this is the original topic for the thread without the "re: "
287+
String topic = msg.getConversationTopic();
288+
metadata.set(TikaCoreProperties.SUBJECT, topic);
289+
metadata.set(TikaCoreProperties.DESCRIPTION, topic);
290+
metadata.set(MAPI.CONVERSATION_TOPIC, topic);
291+
Chunks mainChunks = msg.getMainChunks();
292+
// everything below needs the main chunks; skipped entirely when they are absent
if (mainChunks != null) {
293+
if (mainChunks.getMessageId() != null) {
294+
metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks
295+
.getMessageId()
296+
.getValue());
297+
}
298+
299+
// conversation index: only the first chunk is used, and only when it is a ByteChunk
List<Chunk> conversationIndex = mainChunks.getAll().get(MAPIProperty.CONVERSATION_INDEX);
300+
if (conversationIndex != null && ! conversationIndex.isEmpty()) {
301+
Chunk chunk = conversationIndex.get(0);
302+
if (chunk instanceof ByteChunk) {
303+
byte[] bytes = ((ByteChunk)chunk).getValue();
304+
// raw bytes are stored as a hex string
String hex = Hex.encodeHexString(bytes);
305+
metadata.set(MAPI.CONVERSATION_INDEX, hex);
306+
}
307+
}
308+
309+
// every StringChunk reference is added as a separate value; other chunk types are ignored
List<Chunk> internetReferences = mainChunks.getAll().get(MAPIProperty.INTERNET_REFERENCES);
310+
if (internetReferences != null) {
311+
for (Chunk ref : internetReferences) {
312+
if (ref instanceof StringChunk) {
313+
metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk) ref).getValue());
314+
}
315+
}
316+
}
317+
List<Chunk> inReplyToIds = mainChunks.getAll().get(MAPIProperty.IN_REPLY_TO_ID);
318+
if (inReplyToIds != null && ! inReplyToIds.isEmpty()) {
319+
// NOTE(review): relies on Chunk.toString() of the first chunk yielding the id text -- confirm
metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds.get(0).toString());
320+
}
321+
322+
// copy each literal time property that is present, using its first value only
for (Map.Entry<MAPIProperty, Property> e : LITERAL_TIME_PROPERTIES.entrySet()) {
323+
List<PropertyValue> timeProp = mainChunks.getProperties().get(e.getKey());
324+
if (timeProp != null && ! timeProp.isEmpty()) {
325+
// NOTE(review): unchecked cast -- assumes the stored value is a TimePropertyValue; confirm
Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue();
326+
metadata.set(e.getValue(), cal);
327+
}
328+
}
329+
330+
MessageSubmissionChunk messageSubmissionChunk = mainChunks.getSubmissionChunk();
331+
if (messageSubmissionChunk != null) {
332+
String submissionId = messageSubmissionChunk.getSubmissionId();
333+
metadata.set(MAPI.SUBMISSION_ID, submissionId);
334+
metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, messageSubmissionChunk.getAcceptedAtTime());
335+
}
336+
337+
}
338+
}
339+
340+
341+
/**
* Sets TikaCoreProperties.CREATED and TikaCoreProperties.MODIFIED. The
* message date is used when msg.getMessageDate() is non-null; otherwise the
* first header entry whose key starts with "date:" (case-insensitive) is
* parsed leniently, or stored as raw text if parsing fails. Afterwards
* MODIFIED is overwritten with LAST_MODIFICATION_TIME when that property
* is present.
*
* @param msg the message to read from
* @param headers normalized headers; may be null or empty
* @param metadata the metadata object to populate
* @throws ChunkNotFoundException declared for the message accessors called here
*/
private void handleGeneralDates(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
342+
// Date - try two ways to find it
343+
// First try via the proper chunk
344+
if (msg.getMessageDate() != null) {
345+
metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
346+
metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
347+
} else {
348+
if (headers != null && headers.size() > 0) {
349+
for (Map.Entry<String, String[]> header : headers.entrySet()) {
350+
String headerKey = header.getKey();
351+
// NOTE(review): the date text is taken from the map KEY (after the first ':'), not the value -- confirm against normalizeHeaders
if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
352+
String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
353+
354+
// See if we can parse it as a normal mail date
355+
try {
356+
Date d = MailDateParser.parseDateLenient(date);
357+
metadata.set(TikaCoreProperties.CREATED, d);
358+
metadata.set(TikaCoreProperties.MODIFIED, d);
359+
} catch (SecurityException e ) {
360+
// security exceptions are never swallowed
throw e;
361+
} catch (Exception e) {
362+
// Store it as-is, and hope for the best...
363+
metadata.set(TikaCoreProperties.CREATED, date);
364+
metadata.set(TikaCoreProperties.MODIFIED, date);
365+
}
366+
// only the first matching date header is considered
break;
367+
}
368+
}
369+
}
370+
}
371+
//try to overwrite the modified property if the actual LAST_MODIFICATION_TIME property exists.
372+
// NOTE(review): msg.getMainChunks() is dereferenced without a null check here, unlike handleMessageInfo
List<PropertyValue> timeProp = msg.getMainChunks().getProperties().get(MAPIProperty.LAST_MODIFICATION_TIME);
373+
if (timeProp != null && ! timeProp.isEmpty()) {
374+
// NOTE(review): unchecked cast -- assumes the stored value is a TimePropertyValue; confirm
Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue();
375+
metadata.set(TikaCoreProperties.MODIFIED, cal);
376+
}
377+
378+
}
379+
280380
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
281381
XHTMLContentHandler xhtml)
282382
throws SAXException, IOException, TikaException {
@@ -395,7 +495,7 @@ private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
395495
Chunks chunks = msg.getMainChunks();
396496
StringChunk sentByServerType = chunks.getSentByServerType();
397497
if (sentByServerType != null) {
398-
metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
498+
metadata.set(MAPI.SENT_BY_SERVER_TYPE, sentByServerType.getValue());
399499
}
400500

401501
Map<MAPIProperty, List<Chunk>> mainChunks = msg.getMainChunks().getAll();
@@ -411,12 +511,12 @@ private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
411511
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME,
412512
metadata);
413513
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
414-
Office.MAPI_FROM_REPRESENTING_NAME, metadata);
514+
MAPI.FROM_REPRESENTING_NAME, metadata);
415515

416516
setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL,
417517
metadata);
418518
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
419-
Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);
519+
MAPI.FROM_REPRESENTING_EMAIL, metadata);
420520

421521
for (Recipient recipient : buildRecipients()) {
422522
switch (recipient.recipientType) {

0 commit comments

Comments
 (0)