24
24
import java .nio .charset .IllegalCharsetNameException ;
25
25
import java .nio .charset .UnsupportedCharsetException ;
26
26
import java .util .ArrayList ;
27
+ import java .util .Calendar ;
27
28
import java .util .Collections ;
28
29
import java .util .Date ;
30
+ import java .util .HashMap ;
29
31
import java .util .LinkedHashMap ;
30
32
import java .util .LinkedList ;
31
33
import java .util .List ;
34
36
import java .util .regex .Matcher ;
35
37
import java .util .regex .Pattern ;
36
38
39
+ import org .apache .commons .codec .binary .Hex ;
37
40
import org .apache .commons .io .input .UnsynchronizedByteArrayInputStream ;
38
41
import org .apache .james .mime4j .codec .DecodeMonitor ;
39
42
import org .apache .james .mime4j .codec .DecoderUtil ;
44
47
import org .apache .poi .hsmf .datatypes .Chunk ;
45
48
import org .apache .poi .hsmf .datatypes .Chunks ;
46
49
import org .apache .poi .hsmf .datatypes .MAPIProperty ;
50
+ import org .apache .poi .hsmf .datatypes .MessageSubmissionChunk ;
47
51
import org .apache .poi .hsmf .datatypes .PropertyValue ;
48
52
import org .apache .poi .hsmf .datatypes .RecipientChunks ;
49
53
import org .apache .poi .hsmf .datatypes .StringChunk ;
56
60
import org .apache .tika .exception .TikaException ;
57
61
import org .apache .tika .extractor .EmbeddedDocumentUtil ;
58
62
import org .apache .tika .io .TikaInputStream ;
63
+ import org .apache .tika .metadata .MAPI ;
59
64
import org .apache .tika .metadata .Message ;
60
65
import org .apache .tika .metadata .Metadata ;
61
- import org .apache .tika .metadata .Office ;
62
66
import org .apache .tika .metadata .Property ;
63
67
import org .apache .tika .metadata .TikaCoreProperties ;
64
68
import org .apache .tika .mime .MediaType ;
80
84
public class OutlookExtractor extends AbstractPOIFSExtractor {
81
85
82
86
private static final Metadata EMPTY_METADATA = new Metadata ();
83
-
84
- private static Pattern HEADER_KEY_PAT =
85
- Pattern .compile ("\\ A([\\ x21-\\ x39\\ x3B-\\ x7E]+):(.*?)\\ Z" );
86
-
87
- private final MAPIMessage msg ;
87
/**
 * MAPI time-valued properties that are copied into the metadata under a
 * key derived from the MAPI property's own name (see the static
 * initializer below).
 */
private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = {
    MAPIProperty.CLIENT_SUBMIT_TIME,
    MAPIProperty.CREATION_TIME,
    MAPIProperty.DEFERRED_DELIVERY_TIME,
    MAPIProperty.DELIVER_TIME,
    //EXPAND BEGIN and EXPAND END?
    MAPIProperty.EXPIRY_TIME,
    MAPIProperty.LAST_MODIFICATION_TIME,
    MAPIProperty.LATEST_DELIVERY_TIME,
    MAPIProperty.MESSAGE_DELIVERY_TIME,
    MAPIProperty.MESSAGE_DOWNLOAD_TIME,
    MAPIProperty.ORIGINAL_DELIVERY_TIME,
    MAPIProperty.ORIGINAL_SUBMIT_TIME,
    MAPIProperty.PROVIDER_SUBMIT_TIME,
    MAPIProperty.RECEIPT_TIME,
    MAPIProperty.REPLY_TIME,
    MAPIProperty.REPORT_TIME
};

/**
 * Maps each entry of {@link #LITERAL_TIME_MAPI_PROPERTIES} to the Tika
 * date property it is reported under.
 */
private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES = new HashMap<>();

static {
    for (MAPIProperty mapiTimeProperty : LITERAL_TIME_MAPI_PROPERTIES) {
        // Key = PREFIX_MAPI_META + the lower-cased MAPI name with its first
        // three characters dropped and '_' turned into '-'.
        // NOTE(review): the three dropped characters are presumably a "PR_"
        // prefix -- confirm against MAPIProperty.
        String keySuffix = mapiTimeProperty.mapiProperty
                .toLowerCase(Locale.ROOT)
                .substring(3)
                .replace('_', '-');
        LITERAL_TIME_PROPERTIES.put(mapiTimeProperty,
                Property.internalDate(MAPI.PREFIX_MAPI_META + keySuffix));
    }
}
88
119
//this according to the spec; in practice, it is probably more likely
89
120
//that a "split field" fails to start with a space character than
90
121
//that a real header contains anything but [-_A-Za-z0-9].
91
122
//e.g.
92
123
//header: this header goes onto the next line
93
124
125
+ private static Pattern HEADER_KEY_PAT =
126
+ Pattern .compile ("\\ A([\\ x21-\\ x39\\ x3B-\\ x7E]+):(.*?)\\ Z" );
127
+
128
+ private final MAPIMessage msg ;
94
129
private final ParseContext parseContext ;
95
130
private final boolean extractAllAlternatives ;
96
131
HtmlEncodingDetector detector = new HtmlEncodingDetector ();
@@ -158,7 +193,7 @@ public void parse(XHTMLContentHandler xhtml)
158
193
msg .setReturnNullOnMissingChunk (true );
159
194
160
195
try {
161
- parentMetadata .set (Office . MAPI_MESSAGE_CLASS , msg .getMessageClassEnum ().name ());
196
+ parentMetadata .set (MAPI . MESSAGE_CLASS , msg .getMessageClassEnum ().name ());
162
197
} catch (ChunkNotFoundException e ) {
163
198
//swallow
164
199
}
@@ -170,15 +205,10 @@ public void parse(XHTMLContentHandler xhtml)
170
205
}
171
206
172
207
// Start with the metadata
173
- String subject = msg .getSubject ();
174
208
Map <String , String []> headers = normalizeHeaders (msg .getHeaders ());
175
- String from = msg .getDisplayFrom ();
176
209
177
210
handleFromTo (headers , parentMetadata );
178
-
179
- parentMetadata .set (TikaCoreProperties .TITLE , subject );
180
- parentMetadata .set (TikaCoreProperties .SUBJECT , msg .getConversationTopic ());
181
- parentMetadata .set (TikaCoreProperties .DESCRIPTION , msg .getConversationTopic ());
211
+ handleMessageInfo (msg , headers , parentMetadata );
182
212
183
213
try {
184
214
for (String recipientAddress : msg .getRecipientEmailAddressList ()) {
@@ -197,35 +227,7 @@ public void parse(XHTMLContentHandler xhtml)
197
227
}
198
228
}
199
229
200
- // Date - try two ways to find it
201
- // First try via the proper chunk
202
- if (msg .getMessageDate () != null ) {
203
- parentMetadata .set (TikaCoreProperties .CREATED , msg .getMessageDate ().getTime ());
204
- parentMetadata .set (TikaCoreProperties .MODIFIED , msg .getMessageDate ().getTime ());
205
- } else {
206
- if (headers != null && headers .size () > 0 ) {
207
- for (Map .Entry <String , String []> header : headers .entrySet ()) {
208
- String headerKey = header .getKey ();
209
- if (headerKey .toLowerCase (Locale .ROOT ).startsWith ("date:" )) {
210
- String date = headerKey .substring (headerKey .indexOf (':' ) + 1 ).trim ();
211
-
212
- // See if we can parse it as a normal mail date
213
- try {
214
- Date d = MailDateParser .parseDateLenient (date );
215
- parentMetadata .set (TikaCoreProperties .CREATED , d );
216
- parentMetadata .set (TikaCoreProperties .MODIFIED , d );
217
- } catch (SecurityException e ) {
218
- throw e ;
219
- } catch (Exception e ) {
220
- // Store it as-is, and hope for the best...
221
- parentMetadata .set (TikaCoreProperties .CREATED , date );
222
- parentMetadata .set (TikaCoreProperties .MODIFIED , date );
223
- }
224
- break ;
225
- }
226
- }
227
- }
228
- }
230
+ handleGeneralDates (msg , headers , parentMetadata );
229
231
230
232
// Get the message body. Preference order is: html, rtf, text
231
233
Chunk htmlChunk = null ;
@@ -277,6 +279,104 @@ public void parse(XHTMLContentHandler xhtml)
277
279
}
278
280
}
279
281
282
+ private void handleMessageInfo (MAPIMessage msg , Map <String , String []> headers , Metadata metadata )
283
+ throws ChunkNotFoundException {
284
+ //this is the literal subject including "re: "
285
+ metadata .set (TikaCoreProperties .TITLE , msg .getSubject ());
286
+ //this is the original topic for the thread without the "re: "
287
+ String topic = msg .getConversationTopic ();
288
+ metadata .set (TikaCoreProperties .SUBJECT , topic );
289
+ metadata .set (TikaCoreProperties .DESCRIPTION , topic );
290
+ metadata .set (MAPI .CONVERSATION_TOPIC , topic );
291
+ Chunks mainChunks = msg .getMainChunks ();
292
+ if (mainChunks != null ) {
293
+ if (mainChunks .getMessageId () != null ) {
294
+ metadata .set (MAPI .INTERNET_MESSAGE_ID , mainChunks
295
+ .getMessageId ()
296
+ .getValue ());
297
+ }
298
+
299
+ List <Chunk > conversationIndex = mainChunks .getAll ().get (MAPIProperty .CONVERSATION_INDEX );
300
+ if (conversationIndex != null && ! conversationIndex .isEmpty ()) {
301
+ Chunk chunk = conversationIndex .get (0 );
302
+ if (chunk instanceof ByteChunk ) {
303
+ byte [] bytes = ((ByteChunk )chunk ).getValue ();
304
+ String hex = Hex .encodeHexString (bytes );
305
+ metadata .set (MAPI .CONVERSATION_INDEX , hex );
306
+ }
307
+ }
308
+
309
+ List <Chunk > internetReferences = mainChunks .getAll ().get (MAPIProperty .INTERNET_REFERENCES );
310
+ if (internetReferences != null ) {
311
+ for (Chunk ref : internetReferences ) {
312
+ if (ref instanceof StringChunk ) {
313
+ metadata .add (MAPI .INTERNET_REFERENCES , ((StringChunk ) ref ).getValue ());
314
+ }
315
+ }
316
+ }
317
+ List <Chunk > inReplyToIds = mainChunks .getAll ().get (MAPIProperty .IN_REPLY_TO_ID );
318
+ if (inReplyToIds != null && ! inReplyToIds .isEmpty ()) {
319
+ metadata .add (MAPI .IN_REPLY_TO_ID , inReplyToIds .get (0 ).toString ());
320
+ }
321
+
322
+ for (Map .Entry <MAPIProperty , Property > e : LITERAL_TIME_PROPERTIES .entrySet ()) {
323
+ List <PropertyValue > timeProp = mainChunks .getProperties ().get (e .getKey ());
324
+ if (timeProp != null && ! timeProp .isEmpty ()) {
325
+ Calendar cal = ((PropertyValue .TimePropertyValue )timeProp .get (0 )).getValue ();
326
+ metadata .set (e .getValue (), cal );
327
+ }
328
+ }
329
+
330
+ MessageSubmissionChunk messageSubmissionChunk = mainChunks .getSubmissionChunk ();
331
+ if (messageSubmissionChunk != null ) {
332
+ String submissionId = messageSubmissionChunk .getSubmissionId ();
333
+ metadata .set (MAPI .SUBMISSION_ID , submissionId );
334
+ metadata .set (MAPI .SUBMISSION_ACCEPTED_AT_TIME , messageSubmissionChunk .getAcceptedAtTime ());
335
+ }
336
+
337
+ }
338
+ }
339
+
340
+
341
+ private void handleGeneralDates (MAPIMessage msg , Map <String , String []> headers , Metadata metadata ) throws ChunkNotFoundException {
342
+ // Date - try two ways to find it
343
+ // First try via the proper chunk
344
+ if (msg .getMessageDate () != null ) {
345
+ metadata .set (TikaCoreProperties .CREATED , msg .getMessageDate ().getTime ());
346
+ metadata .set (TikaCoreProperties .MODIFIED , msg .getMessageDate ().getTime ());
347
+ } else {
348
+ if (headers != null && headers .size () > 0 ) {
349
+ for (Map .Entry <String , String []> header : headers .entrySet ()) {
350
+ String headerKey = header .getKey ();
351
+ if (headerKey .toLowerCase (Locale .ROOT ).startsWith ("date:" )) {
352
+ String date = headerKey .substring (headerKey .indexOf (':' ) + 1 ).trim ();
353
+
354
+ // See if we can parse it as a normal mail date
355
+ try {
356
+ Date d = MailDateParser .parseDateLenient (date );
357
+ metadata .set (TikaCoreProperties .CREATED , d );
358
+ metadata .set (TikaCoreProperties .MODIFIED , d );
359
+ } catch (SecurityException e ) {
360
+ throw e ;
361
+ } catch (Exception e ) {
362
+ // Store it as-is, and hope for the best...
363
+ metadata .set (TikaCoreProperties .CREATED , date );
364
+ metadata .set (TikaCoreProperties .MODIFIED , date );
365
+ }
366
+ break ;
367
+ }
368
+ }
369
+ }
370
+ }
371
+ //try to overwrite the modified property if the actual LAST_MODIFICATION_TIME property exists.
372
+ List <PropertyValue > timeProp = msg .getMainChunks ().getProperties ().get (MAPIProperty .LAST_MODIFICATION_TIME );
373
+ if (timeProp != null && ! timeProp .isEmpty ()) {
374
+ Calendar cal = ((PropertyValue .TimePropertyValue )timeProp .get (0 )).getValue ();
375
+ metadata .set (TikaCoreProperties .MODIFIED , cal );
376
+ }
377
+
378
+ }
379
+
280
380
private void handleBodyChunks (Chunk htmlChunk , Chunk rtfChunk , Chunk textChunk ,
281
381
XHTMLContentHandler xhtml )
282
382
throws SAXException , IOException , TikaException {
@@ -395,7 +495,7 @@ private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
395
495
Chunks chunks = msg .getMainChunks ();
396
496
StringChunk sentByServerType = chunks .getSentByServerType ();
397
497
if (sentByServerType != null ) {
398
- metadata .set (Office . MAPI_SENT_BY_SERVER_TYPE , sentByServerType .getValue ());
498
+ metadata .set (MAPI . SENT_BY_SERVER_TYPE , sentByServerType .getValue ());
399
499
}
400
500
401
501
Map <MAPIProperty , List <Chunk >> mainChunks = msg .getMainChunks ().getAll ();
@@ -411,12 +511,12 @@ private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
411
511
setFirstChunk (mainChunks .get (MAPIProperty .SENDER_NAME ), Message .MESSAGE_FROM_NAME ,
412
512
metadata );
413
513
setFirstChunk (mainChunks .get (MAPIProperty .SENT_REPRESENTING_NAME ),
414
- Office . MAPI_FROM_REPRESENTING_NAME , metadata );
514
+ MAPI . FROM_REPRESENTING_NAME , metadata );
415
515
416
516
setFirstChunk (mainChunks .get (MAPIProperty .SENDER_EMAIL_ADDRESS ), Message .MESSAGE_FROM_EMAIL ,
417
517
metadata );
418
518
setFirstChunk (mainChunks .get (MAPIProperty .SENT_REPRESENTING_EMAIL_ADDRESS ),
419
- Office . MAPI_FROM_REPRESENTING_EMAIL , metadata );
519
+ MAPI . FROM_REPRESENTING_EMAIL , metadata );
420
520
421
521
for (Recipient recipient : buildRecipients ()) {
422
522
switch (recipient .recipientType ) {
0 commit comments