@@ -20,7 +20,7 @@ class smscalls(user_config):
2020
2121from datetime import datetime , timezone
2222from pathlib import Path
23- from typing import NamedTuple , Iterator , Set , Tuple , Optional
23+ from typing import NamedTuple , Iterator , Set , Tuple , Optional , Any , Dict , List
2424
2525from lxml import etree
2626
@@ -150,6 +150,165 @@ def _extract_messages(path: Path) -> Iterator[Res[Message]]:
150150 )
151151
152152
class MMSContentPart(NamedTuple):
    """One content part (a `<part>` element) of an MMS message."""

    # the part's 'seq' attribute converted to an int; parts with seq == '-1' are never stored
    sequence_index: int
    # the part's 'ct' attribute (presumably the MIME content type -- confirm against the export format)
    content_type: str
    # the part's 'name' attribute, falling back to 'cl' when 'name' is missing or 'null'
    filename: str
    # the part's 'text' attribute; None when missing or the literal string 'null'
    text: Optional[str]
    # the part's 'data' attribute; None when missing or the literal string 'null'
    # NOTE(review): presumably the encoded attachment payload -- confirm the encoding
    data: Optional[str]
159+
160+
class MMS(NamedTuple):
    """A single MMS message, which may be part of a group conversation."""

    # timestamp parsed from the message's 'date' attribute
    dt: datetime
    # the message's 'readable_date' attribute, kept verbatim
    dt_readable: str
    # content parts (text/attachments) collected for this message
    parts: List[MMSContentPart]
    # NOTE: this is often something like 'Name 1, Name 2', but might be different depending on your client
    who: Optional[str]
    # NOTE: this can be a single phone number, or multiple, split by '~' or ','. It's better to think
    # of this as a 'key' or 'conversation ID'; phone numbers are also present in 'addresses'
    phone_number: str
    # (address, type) pairs; type codes: 129 = BCC, 130 = CC, 151 = To, 137 = From
    addresses: List[Tuple[str, int]]
    # 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox
    message_type: int

    @property
    def from_user(self) -> str:
        """Return the address of whoever sent this message.

        Raises:
            RuntimeError: if none of the addresses has type 137 ('From').
        """
        # These can be group messages, so message_type alone does not tell us
        # the sender; scan the address list for the 'From' (137) entry instead.
        # Who is CC/'To' is not obvious in many message clients.
        for address, kind in self.addresses:
            if kind == 137:
                return address
        # hmm, maybe return instead? but this probably shouldn't happen, it means
        # something is very broken
        raise RuntimeError(f'No from address matching 137 found in {self.addresses}')

    @property
    def from_me(self) -> bool:
        """True when the message was sent by the owner of the export (message_type 2 = Sent)."""
        return self.message_type == 2
192+
193+
def mms() -> Iterator[Res[MMS]]:
    """Yield every unique MMS message found in the 'sms-*.xml' export files.

    Errors produced while parsing are yielded as-is rather than raised.
    Messages are de-duplicated across files on (dt, phone_number, from_user),
    so a message present in several overlapping exports is emitted once.
    """
    files = get_files(config.export_path, glob='sms-*.xml')

    emitted: Set[Tuple[datetime, str, str]] = set()
    for p in files:
        for c in _extract_mms(p):
            if isinstance(c, Exception):
                yield c
                continue
            try:
                sender = c.from_user
            except RuntimeError as e:
                # from_user raises when the message has no 'From' (137) address.
                # Report that as an error value, like every other parse problem,
                # instead of letting it abort iteration over the remaining messages.
                yield e
                continue
            key = (c.dt, c.phone_number, sender)
            if key in emitted:
                continue
            emitted.add(key)
            yield c
208+
209+
210+ def _resolve_null_str (value : Optional [str ]) -> Optional [str ]:
211+ if value is None :
212+ return None
213+ # hmm.. theres some risk of the text actually being 'null', but theres
214+ # no way to distinguish that from XML values
215+ if value == 'null' :
216+ return None
217+ return value
218+
219+
def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
    """Parse every `<mms>` element of the XML export at *path*.

    Yields one MMS per message whose required attributes are present and valid.
    Problems are reported by yielding a RuntimeError instead of raising:

    - a message missing date/readable_date/msg_box/address, or with a
      non-numeric msg_box, is reported and skipped entirely;
    - a malformed `<addr>` or `<part>` is reported and left out, but the
      enclosing message is still yielded with the remaining addresses/parts.
    """
    tr = etree.parse(str(path))

    for mxml in tr.findall('mms'):
        dt = mxml.get('date')
        dt_readable = mxml.get('readable_date')
        message_type = mxml.get('msg_box')

        who = mxml.get('contact_name')
        if who is not None and who in UNKNOWN:
            who = None
        phone_number = mxml.get('address')

        if dt is None or dt_readable is None or message_type is None or phone_number is None:
            mxml_str = etree.tostring(mxml).decode('utf-8')
            yield RuntimeError(f'Missing one or more required attributes [date, readable_date, msg_box, address] in {mxml_str}')
            continue

        # Validate before the int() conversion below, same as for 'seq' and the
        # address 'type': a bad value becomes a yielded error, not a ValueError
        # that would terminate the whole generator.
        # isdecimal() (not isdigit()) because isdigit() also accepts characters
        # such as '²' which int() cannot convert.
        if not message_type.isdecimal():
            yield RuntimeError(f'Invalid msg_box {message_type} {type(message_type)}, cannot convert to number')
            continue

        addresses: List[Tuple[str, int]] = []
        for addr_parent in mxml.findall('addrs'):
            for addr in addr_parent.findall('addr'):
                addr_data = addr.attrib
                user_address = addr_data.get('address')
                user_type = addr_data.get('type')
                if user_address is None or user_type is None:
                    addr_str = etree.tostring(addr_parent).decode()
                    yield RuntimeError(f'Missing one or more required attributes [address, type] in {addr_str}')
                    continue
                if not user_type.isdecimal():
                    yield RuntimeError(f'Invalid address type {user_type} {type(user_type)}, cannot convert to number')
                    continue
                addresses.append((user_address, int(user_type)))

        content: List[MMSContentPart] = []

        for part_root in mxml.findall('parts'):

            for part in part_root.findall('part'):

                # the first item is an SMIL XML element encoded as a string which describes
                # how the rest of the parts are laid out
                # https://www.w3.org/TR/SMIL3/smil-timing.html#Timing-TimeContainerSyntax
                # An example:
                # <smil><head><layout><root-layout/><region id="Text" top="0" left="0" height="100%" width="100%"/></layout></head><body><par dur="5000ms"><text src="text.000000.txt" region="Text" /></par></body></smil>
                #
                # This seems pretty useless, so we skip it (it has seq == '-1') and just return the
                # text/images/data
                #
                # attrib is an lxml-internal _Attrib type which can't be typed by any sort of
                # mappingproxy. maybe a protocol could work..?
                part_data: Dict[str, Any] = part.attrib  # type: ignore
                seq: Optional[str] = part_data.get('seq')
                if seq == '-1':
                    continue

                if seq is None or not seq.isdecimal():
                    yield RuntimeError(f'seq must be a number, was seq={seq} {type(seq)} in {part_data}')
                    continue

                charset_type: Optional[str] = _resolve_null_str(part_data.get('ct'))
                filename: Optional[str] = _resolve_null_str(part_data.get('name'))
                # in some cases (images, cards), the filename is set in 'cl' instead
                if filename is None:
                    filename = _resolve_null_str(part_data.get('cl'))
                text: Optional[str] = _resolve_null_str(part_data.get('text'))
                data: Optional[str] = _resolve_null_str(part_data.get('data'))

                # a usable part needs a content type, a filename and at least one of text/data
                if charset_type is None or filename is None or (text is None and data is None):
                    yield RuntimeError(f'Missing one or more required attributes [ct, name, (text, data)] must be present in {part_data}')
                    continue

                content.append(
                    MMSContentPart(
                        sequence_index=int(seq),
                        content_type=charset_type,
                        filename=filename,
                        text=text,
                        data=data,
                    )
                )

        yield MMS(
            dt=_parse_dt_ms(dt),
            dt_readable=dt_readable,
            who=who,
            phone_number=phone_number,
            message_type=int(message_type),
            parts=content,
            addresses=addresses,
        )
310+
311+
153312# See https://github.com/karlicoss/HPI/pull/90#issuecomment-702422351
154313# for potentially parsing timezone from the readable_date
155314def _parse_dt_ms (d : str ) -> datetime :
@@ -162,4 +321,5 @@ def stats() -> Stats:
162321 return {
163322 ** stat (calls ),
164323 ** stat (messages ),
324+ ** stat (mms ),
165325 }
0 commit comments