diff --git a/README.md b/README.md index 6509584..41176eb 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,18 @@ -voice2json +`voice2json` ============= - *Export your Google Voice call & text history as JSON* Google Takeout only offers HTML export of your Google Voice call and text message records. `voice2json` converts these HTML files into machine-readable JSON for easier analysis. ## Usage - *Note: Beautiful Soup 4 is a required dependency for HTML parsing. Install it using* `pip install beautifulsoup4`*.* First, download your Google Voice call history as HTML from [Google Takeout](https://www.google.com/settings/takeout). Unzip the file and navigate to the `Voice/Calls` directory. Then run the `voice2json.py`, passing in the path to the calls directory, and you're all done: - ``` $ python voice2json.py /path/to/Takeout/Voice/Calls /path/to/output.json ``` Slightly more detailed: - ``` $ python voice2json.py -h usage: voice2json.py [-h] source [output] @@ -30,72 +26,67 @@ optional arguments: ``` ## Output - `voice2json` produces a JSON file in the following format: ```js { - "records": [ - - // Calls, voicemails, etc... - { - "date": "2013-01-27T04:21:24.000Z", // ISO-formatted date - "duration": 56000.0, // If applicable, the duration of the call - "tags": [ // Array of the tags(s) associated with the record (see below) - "received", - ... - ], - "contributors": [ // An array of participants in the call - { - "tel": "+01234567890", // Stringified telephone number containing country code - "name": "John Doe" // Name (if known, otherwise an empty string) - }, - ... - ] - }, - - // Text messages - { - "date": "2014-05-07T19:26:51.780Z", // ISO-formatted start date of conversation - "conversation": [ // A list of messages in the conversation - { - "date": "2014-05-07T19:26:51.780Z", // ISO-formatted date of message - "message": "I'm right behind you.", - "sender": { - "tel": "+01234567890", // Stringified telephone number containing country code - "name": "John Doe" // Name (if known, otherwise an empty string) - } - }, - ... - ], - "contributors": [ - { - "tel": "+01234567890", // Stringified telephone number containing country code - "name": "John Doe" // Name (if known, otherwise an empty string) - }, - { - "tel": "+01234567890", // Your phone number - "name": "Me" - } - ], - "tags": [ // A list of tags for the conversation - "inbox", - "sms" - ] - } - - ... + "records": [ + // Calls, voicemails, etc... + { + "date": "2013-01-27T04:21:24.000Z", // ISO-formatted date + "duration": 56000.0, // If applicable, the duration of the call + "tags": [ // Array of the tags(s) associated with the record (see below) + "received", + ... + ], + "contributors": [ // An array of participants in the call + { + "tel": "+01234567890", // Stringified telephone number containing country code + "name": "John Doe" // Name (if known, otherwise an empty string) + }, + ... + ] + }, + // Text messages + { + "date": "2014-05-07T19:26:51.780Z", // ISO-formatted start date of conversation + "conversation": [ // A list of messages in the conversation + { + "date": "2014-05-07T19:26:51.780Z", // ISO-formatted date of message + "message": "I'm right behind you.", + "sender": { + "tel": "+01234567890", // Stringified telephone number containing country code + "name": "John Doe" // Name (if known, otherwise an empty string) + } + }, + ... + ], + "contributors": [ + { + "tel": "+01234567890", // Stringified telephone number containing country code + "name": "John Doe" // Name (if known, otherwise an empty string) + }, + { + "tel": "+01234567890", // Your phone number + "name": "Me" + } + ], + "tags": [ // A list of tags for the conversation + "inbox", + "sms" + ] + } + ... } ``` Possible tags include: - - - **received** – an incoming call, received and answered - - **missed** – an incoming call that was not answered - - **placed** – an outgoing call - - **inbox** – an item in the Google Voice inbox (e.g., a voicemail) - - **voicemail** – a voicemail message - - **unread** – used in conjunction with a voicemail to indicate that it is unread - - **sms** – a text message + - **received** — an incoming call, received and answered + - **missed** — an incoming call that was not answered + - **placed** — an outgoing call + - **inbox** — an item in the Google Voice inbox (e.g., a voicemail) + - **voicemail** — a voicemail message + - **unread** — used in conjunction with a voicemail to indicate that it is unread + - **sms** — a text message Happy analyzing! diff --git a/voice2json.py b/voice2json.py index 360f388..7c1102c 100644 --- a/voice2json.py +++ b/voice2json.py @@ -1,158 +1,130 @@ -import os -import re -import io -import sys -import glob -import json -import argparse -import datetime +import os, re, io, sys, glob, json, argparse, datetime from operator import itemgetter - from bs4 import BeautifulSoup - +# DURATION_RE = r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?' - - +# def convert_to_type(s): - return s.replace('http://www.google.com/voice#', '') - - + return s.replace('http://www.google.com/voice#', '') +# def convert_to_tel(s): - return s.replace('tel:', '') - - + return s.replace('tel:', '') +# def convert_to_duration(s): - r = re.search(DURATION_RE, s) - td = datetime.timedelta(hours=int(r.group(1) or 0), - minutes=int(r.group(2) or 0), - seconds=int(r.group(3) or 0)) - return td.total_seconds() * 1000 - - + r = re.search(DURATION_RE, s) + td = datetime.timedelta( + hours = int(r.group(1) or 0), + minutes = int(r.group(2) or 0), + seconds = int(r.group(3) or 0)) + return td.total_seconds() * 1000 +# def serialize_general_to_record(raw): - soup = BeautifulSoup(raw, 'html.parser') - - contributors = [] - for contributor in soup.find_all('div', class_='contributor'): - contributors.append({ - 'name': contributor.find('span', class_='fn').string or '', - 'tel': convert_to_tel(contributor.find('a', class_='tel')['href']) - }) - - record = { - 'tags': [convert_to_type(a['href']) for a in - soup.find_all('a', rel='tag')], - 'date': soup.find('abbr', class_='published')['title'], - 'contributors': contributors - } - - if soup.find('abbr', class_='duration') is not None: - record['duration'] = convert_to_duration( - soup.find('abbr', class_='duration')['title']) - - return record - - + soup = BeautifulSoup(raw, 'html.parser') + contributors = [] + for contributor in soup.find_all('div', class_ = 'contributor'): + contributors.append({ + 'name': contributor.find('span', class_ = 'fn').string or '', + 'tel' : convert_to_tel(contributor.find('a', class_ = 'tel')['href']) + }) + record = { + 'tags': [convert_to_type(a['href']) for a in + soup.find_all('a', rel = 'tag')], + 'date': soup.find('abbr', class_ = 'published')['title'], + 'contributors': contributors + } + if soup.find('abbr', class_ = 'duration') is not None: + record['duration'] = convert_to_duration( + soup.find('abbr', class_ = 'duration')['title']) + return record +# def serialize_text_messages_to_record(raw): - soup = BeautifulSoup(raw, 'html.parser') - - sender = [] - messages = [] - dates = [] - conversation = [] - contributors = [] - - for contributor in soup.find_all('cite', class_='sender'): - # Messages from others are in the "span" tag and messages from you - # are in the "abbr" tag - if contributor.find('span', class_='fn'): - sender.append({ - 'name': contributor.find('span', class_='fn').string or '', - 'tel': convert_to_tel( - contributor.find('a', class_='tel')['href']) - }) - if contributor.find('abbr', class_='fn'): - sender.append({ - 'name': contributor.find('abbr', class_='fn').string or '', - 'tel': convert_to_tel( - contributor.find('a', class_='tel')['href']) - }) - - for message in soup.find_all('q'): - messages.append(message.text) - - for date in soup.find_all('abbr', class_='dt'): - dates.append(date['title']) - - for item in sender: - if item not in contributors: - contributors.append(item) - - # A message where the other side didn't respond. - # Tel is not given and will have to map later :/ - if len(contributors) == 1 and contributors[0]['name'] == 'Me': - title = soup.find('title').text.split('\n')[-1] - if '+' in title: - contributors.append({ - 'name': title, - 'tel': title - }) - else: - contributors.append({ - 'name': title, - 'tel': '' - }) - - for i in range(0, len(messages)): - conversation.append({ - 'sender': sender[i], - 'message': messages[i], - 'date': dates[i] - }) - - record = { - 'date': dates[0], - 'contributors': contributors, - 'conversation': conversation, - 'tags': [convert_to_type(a['href']) for a in - soup.find_all('a', rel='tag')] - } - - return record - - + soup = BeautifulSoup(raw, 'html.parser') + # + sender = [] + messages = [] + dates = [] + conversation = [] + contributors = [] + # + for contributor in soup.find_all('cite', class_ = 'sender'): + # Messages from others are in the "span" tag and messages from you + # are in the "abbr" tag + if contributor.find('span', class_ = 'fn'): + sender.append({ + 'name': contributor.find('span', class_ = 'fn').string or '', + 'tel' : convert_to_tel( + contributor.find('a', class_ = 'tel')['href']) + }) + if contributor.find('abbr', class_ = 'fn'): + sender.append({ + 'name': contributor.find('abbr', class_ = 'fn').string or '', + 'tel' : convert_to_tel( + contributor.find('a', class_ = 'tel')['href']) + }) + for message in soup.find_all('q'): + messages.append(message.text) + for date in soup.find_all('abbr', class_ = 'dt'): + dates.append(date['title']) + for item in sender: + if item not in contributors: + contributors.append(item) + # A message where the other side didn't respond. + # Tel is not given and will have to map later :/ + if len(contributors) == 1 and contributors[0]['name'] == 'Me': + title = soup.find('title').text.split('\n')[-1] + if '+' in title: + contributors.append({ + 'name': title, + 'tel' : title + }) + else: + contributors.append({ + 'name': title, + 'tel' : '' + }) + for i in range(0, len(messages)): + conversation.append({ + 'sender' : sender[i], + 'message': messages[i], + 'date' : dates[i] + }) + record = { + 'date': dates[0], + 'contributors': contributors, + 'conversation': conversation, + 'tags': [convert_to_type(a['href']) for a in + soup.find_all('a', rel='tag')] + } + return record +# def serialize_files_to_json(paths): - records = [] - for path in paths: - with io.open(path, 'r', encoding='utf8') as f: - if 'Text' in path: - serialized = serialize_text_messages_to_record(f.read()) - records.append(serialized) - else: - serialized = serialize_general_to_record(f.read()) - records.append(serialized) - - records.sort(key=itemgetter('date')) - return json.dumps({'records': records}, indent=4) - - + records = [] + for path in paths: + with io.open(path, 'r', encoding='utf8') as f: + if 'Text' in path: + serialized = serialize_text_messages_to_record(f.read()) + records.append(serialized) + else: + serialized = serialize_general_to_record(f.read()) + records.append(serialized) + records.sort(key = itemgetter('date')) + return json.dumps({'records': records}, indent = 4) +# def main(): - parser = argparse.ArgumentParser() - parser.add_argument('source', - help='Directory of call & text HTML files to convert') - parser.add_argument('output', - nargs='?', - type=argparse.FileType('w'), - default=sys.stdout, - help='Where to write JSON output (default: stdout)') - args = parser.parse_args() - - files = glob.glob(os.path.join(args.source, '*.html')) - - json = serialize_files_to_json(files) - - with args.output as f: - f.write(json) - + parser = argparse.ArgumentParser() + parser.add_argument( + 'source', + help='Directory of call & text HTML files to convert') + parser.add_argument( + 'output', + nargs = '?', + type = argparse.FileType('w'), + default = sys.stdout, + help ='Where to write JSON output (default: stdout)') + args = parser.parse_args() + files = glob.glob(os.path.join(args.source, '*.html')) + json = serialize_files_to_json(files) + with args.output as f: + f.write(json) if __name__ == '__main__': - main() + main()