Skip to content

Commit 35dd5d8

Browse files
authored
smscalls: parse mms from smscalls export (#370)
* initial mms exploration
1 parent 8a8a1eb commit 35dd5d8

File tree

2 files changed

+163
-2
lines changed

2 files changed

+163
-2
lines changed

my/smscalls.py

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class smscalls(user_config):
2020

2121
from datetime import datetime, timezone
2222
from pathlib import Path
23-
from typing import NamedTuple, Iterator, Set, Tuple, Optional
23+
from typing import NamedTuple, Iterator, Set, Tuple, Optional, Any, Dict, List
2424

2525
from lxml import etree
2626

@@ -150,6 +150,165 @@ def _extract_messages(path: Path) -> Iterator[Res[Message]]:
150150
)
151151

152152

153+
class MMSContentPart(NamedTuple):
    """One ``<part>`` element belonging to an MMS message."""

    # position of the part within the message (parsed from the 'seq' attribute)
    sequence_index: int
    # value of the part's 'ct' attribute
    content_type: str
    # taken from the 'name' attribute, falling back to 'cl' when 'name' is absent
    filename: str
    # value of the 'text' attribute, if the part has one
    text: Optional[str]
    # value of the 'data' attribute, if the part has one
    data: Optional[str]
159+
160+
161+
class MMS(NamedTuple):
    """A single MMS message together with its content parts and participants."""

    dt: datetime
    dt_readable: str
    parts: List[MMSContentPart]
    # NOTE: this is often something like 'Name 1, Name 2', but might be different depending on your client
    who: Optional[str]
    # NOTE: this can be a single phone number, or multiple, split by '~' or ','. It's better to think
    # of this as a 'key' or 'conversation ID'; phone numbers are also present in 'addresses'
    phone_number: str
    # (address, address type) pairs; the type codes are listed in from_user
    addresses: List[Tuple[str, int]]
    # 1 = Received, 2 = Sent, 3 = Draft, 4 = Outbox
    message_type: int

    @property
    def from_user(self) -> str:
        """
        Return the address of whoever sent this message.

        Raises RuntimeError when none of the addresses is marked as the sender.
        """
        # These can be group messages, so message_type alone does not tell us
        # who the sender is -- scan the participants for the 'From' entry.
        # Who is CC/'To' is not obvious in many message clients.
        #
        # 129 = BCC, 130 = CC, 151 = To, 137 = From
        for address, kind in self.addresses:
            if kind == 137:
                return address
        # hmm, maybe return instead? but this probably shouldn't happen, it means
        # something is very broken
        raise RuntimeError(f'No from address matching 137 found in {self.addresses}')

    @property
    def from_me(self) -> bool:
        """True when the message type marks this message as sent (message_type 2)."""
        return self.message_type == 2
192+
193+
194+
def mms() -> Iterator[Res[MMS]]:
    """
    Yield every MMS message found in the 'sms-*.xml' files under config.export_path.

    The same message seen more than once (for example in overlapping export
    files) is yielded only once; messages are considered identical when their
    timestamp, conversation key (phone_number) and sender address match.

    Errors are yielded as values rather than raised, including the case where
    a message has no sender ('From') address.
    """
    files = get_files(config.export_path, glob='sms-*.xml')

    # (timestamp, conversation key, sender address) of the messages yielded so far
    emitted: Set[Tuple[datetime, str, str]] = set()
    for p in files:
        for c in _extract_mms(p):
            if isinstance(c, Exception):
                yield c
                continue
            try:
                sender = c.from_user
            except RuntimeError as e:
                # from_user raises when no 'From' address is present; hand that to
                # the caller as an error value instead of aborting the whole iteration
                yield e
                continue
            key = (c.dt, c.phone_number, sender)
            if key in emitted:
                continue
            emitted.add(key)
            yield c
208+
209+
210+
def _resolve_null_str(value: Optional[str]) -> Optional[str]:
211+
if value is None:
212+
return None
213+
# hmm.. theres some risk of the text actually being 'null', but theres
214+
# no way to distinguish that from XML values
215+
if value == 'null':
216+
return None
217+
return value
218+
219+
220+
def _extract_mms(path: Path) -> Iterator[Res[MMS]]:
    """
    Parse every ``<mms>`` element in the XML export at ``path``.

    Yields one ``MMS`` per usable element. Problems are yielded as
    ``RuntimeError`` values instead of being raised:

    - a message missing one of date/readable_date/msg_box/address, or whose
      msg_box is not a number, is reported and skipped entirely;
    - a malformed address or part is reported and only that item is dropped,
      the message itself is still yielded.
    """
    tr = etree.parse(str(path))

    for mxml in tr.findall('mms'):
        dt = mxml.get('date')
        dt_readable = mxml.get('readable_date')
        message_type = mxml.get('msg_box')

        who = mxml.get('contact_name')
        if who is not None and who in UNKNOWN:
            who = None
        phone_number = mxml.get('address')

        if dt is None or dt_readable is None or message_type is None or phone_number is None:
            mxml_str = etree.tostring(mxml).decode('utf-8')
            yield RuntimeError(f'Missing one or more required attributes [date, readable_date, msg_box, address] in {mxml_str}')
            continue

        # convert up front so a bad value becomes an error value for this message,
        # rather than a ValueError that aborts the whole generator
        try:
            message_type_num = int(message_type)
        except ValueError:
            yield RuntimeError(f'Invalid msg_box {message_type!r}, cannot convert to number')
            continue

        addresses: List[Tuple[str, int]] = []
        for addr_parent in mxml.findall('addrs'):
            for addr in addr_parent.findall('addr'):
                addr_data = addr.attrib
                user_address = addr_data.get('address')
                user_type = addr_data.get('type')
                if user_address is None or user_type is None:
                    addr_str = etree.tostring(addr_parent).decode()
                    yield RuntimeError(f'Missing one or more required attributes [address, type] in {addr_str}')
                    continue
                # isdecimal (not isdigit): isdigit also accepts characters such as
                # superscript digits, which int() refuses to parse
                if not user_type.isdecimal():
                    yield RuntimeError(f'Invalid address type {user_type} {type(user_type)}, cannot convert to number')
                    continue
                addresses.append((user_address, int(user_type)))

        content: List[MMSContentPart] = []

        for part_root in mxml.findall('parts'):
            for part in part_root.findall('part'):
                # the first item is an SMIL XML element encoded as a string which describes
                # how the rest of the parts are laid out
                # https://www.w3.org/TR/SMIL3/smil-timing.html#Timing-TimeContainerSyntax
                # An example:
                # <smil><head><layout><root-layout/><region id="Text" top="0" left="0" height="100%" width="100%"/></layout></head><body><par dur="5000ms"><text src="text.000000.txt" region="Text" /></par></body></smil>
                #
                # This seems pretty useless, so we skip it (it has seq == '-1') and just
                # return the text/images/data
                #
                # attrib is an lxml-internal _Attrib type which can't be typed by any
                # sort of mappingproxy. maybe a protocol could work..?
                part_data: Dict[str, Any] = part.attrib  # type: ignore
                seq: Optional[str] = part_data.get('seq')
                if seq == '-1':
                    continue

                # isdecimal guarantees that the int(seq) below cannot fail
                if seq is None or not seq.isdecimal():
                    yield RuntimeError(f'seq must be a number, was seq={seq} {type(seq)} in {part_data}')
                    continue

                charset_type: Optional[str] = _resolve_null_str(part_data.get('ct'))
                filename: Optional[str] = _resolve_null_str(part_data.get('name'))
                # in some cases (images, cards), the filename is set in 'cl' instead
                if filename is None:
                    filename = _resolve_null_str(part_data.get('cl'))
                text: Optional[str] = _resolve_null_str(part_data.get('text'))
                data: Optional[str] = _resolve_null_str(part_data.get('data'))

                if charset_type is None or filename is None or (text is None and data is None):
                    yield RuntimeError(f'Missing one or more required attributes [ct, name, (text, data)] must be present in {part_data}')
                    continue

                content.append(
                    MMSContentPart(
                        sequence_index=int(seq),
                        content_type=charset_type,
                        filename=filename,
                        text=text,
                        data=data,
                    )
                )

        # NOTE(review): _parse_dt_ms may still raise on a malformed 'date' value -- confirm
        yield MMS(
            dt=_parse_dt_ms(dt),
            dt_readable=dt_readable,
            who=who,
            phone_number=phone_number,
            message_type=message_type_num,
            parts=content,
            addresses=addresses,
        )
310+
311+
153312
# See https://github.com/karlicoss/HPI/pull/90#issuecomment-702422351
154313
# for potentially parsing timezone from the readable_date
155314
def _parse_dt_ms(d: str) -> datetime:
@@ -162,4 +321,5 @@ def stats() -> Stats:
162321
return {
163322
**stat(calls),
164323
**stat(messages),
324+
**stat(mms),
165325
}

tests/smscalls.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
# TODO implement via stat?
66
def test() -> None:
    """Smoke test: every smscalls data source should produce more than 10 items."""
    from my.smscalls import calls, messages, mms

    for source in (calls, messages, mms):
        assert len(list(source())) > 10

0 commit comments

Comments
 (0)