Skip to content

Commit

Permalink
added trawler script
Browse files Browse the repository at this point in the history
  • Loading branch information
ericwhyne committed Sep 25, 2015
1 parent 09dd40e commit b6ea65e
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 84 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
*~
*#
twitter_oauth_settings.py
data/
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,24 @@
trawler
# trawler
=======

## Getting started
````bash
mkdir ~/.trawler/
cp example_token_file.yaml ~/.trawler/default.yaml
vim ~/.trawler/default.yaml
````
Place your twitter API tokens in ~/.trawler/default.yaml
````bash
./trawler -h
./trawler -sn example_screen_names.txt
````

## Notes

### Useful scripts
The scripts whose names start with the word "save" demonstrate various other functionality.

### Rate Limits
Most of the interesting functionality is in the class
RateLimitedTwitterEndpoint. The class is a wrapper around the (Twython
wrapper around the) Twitter API that handles all of the details of
Expand Down
1 change: 1 addition & 0 deletions example_screen_names.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
@ericwhyne
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
chromium-compact-language-detector
twython>=3.1.2
pyyaml
75 changes: 0 additions & 75 deletions save_by_screename.py

This file was deleted.

107 changes: 107 additions & 0 deletions trawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python
"""
This script downloads Tweets for a given list of usernames and their FF networks to a specified depth.
Run with the -h option to view help info.
"""

# Standard Library modules
import argparse
import codecs
import os
import sys
import yaml
import datetime
import collections

import pprint

# Third party modules
from twython import Twython, TwythonError

# Local modules
from twitter_crawler import (CrawlTwitterTimelines, RateLimitedTwitterEndpoint, FindFriendFollowers,
get_console_info_logger, get_screen_names_from_file, save_tweets_to_json_file,
save_screen_names_to_file)

# Screen names whose friend/follower lists have already been fetched.
# Module-level so that repeated/recursive calls never query the same
# person twice.  A set gives O(1) membership tests (the original list
# was O(n) per lookup).
ff_scanned_screen_names = set()


def get_ff(screen_names, depth, ff_finder, logger):
    """Recursively expand `screen_names` with friend/follower networks.

    Args:
        screen_names: list of Twitter screen names to expand.
        depth: how many friend/follower hops to follow; 0 returns the
            input unchanged.
        ff_finder: object exposing get_ff_screen_names_for_screen_name().
        logger: logger used for progress messages.

    Returns:
        A list containing the original screen names plus every
        friend/follower screen name discovered up to `depth` hops.
        May contain duplicates; callers are expected to de-duplicate.
    """
    if depth == 0:
        return screen_names
    next_level_sns = []
    for screen_name in screen_names:
        if screen_name not in ff_scanned_screen_names:  # don't get ff for the same person twice
            # BUG FIX: removed a leftover debug `print` that duplicated this log line.
            logger.info("Getting ff for %s" % screen_name)
            ff_scanned_screen_names.add(screen_name)
            next_level_sns += ff_finder.get_ff_screen_names_for_screen_name(screen_name)
    return get_ff(screen_names + next_level_sns, depth - 1, ff_finder, logger)  # recursion

def main():
    """Download Tweets for the screen names listed in a file.

    Optionally expands the list through friend/follower networks to a
    given depth, then saves one ``<screen_name>.tweets`` JSON file per
    user under a timestamped ``data/`` directory.

    Exits with status 1 if the output directory cannot be created.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-sn', dest='screen_name_file', default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument('-t', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml",
                        help='A configuration file with Twitter API access tokens. See example_token_file.yaml.')
    # type=int lets argparse validate the value up front instead of a late int() cast.
    parser.add_argument('-d', dest='depth', type=int, default=0,
                        help='Friend and follower depth. A value of 1 will gather all tweets for users \
                              in the file as well as all tweets from their friends and followers. Default is 0.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = "data/" + datetime.datetime.now().isoformat() + "/"
    try:
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
    except OSError as e:
        # BUG FIX: the original referenced an undefined name `directory`
        # (NameError) and exited with status 0 despite the failure.
        logger.error("Could not create directory %s: %s" % (output_directory, e))
        sys.exit(1)
    logger.info("Created directory: %s" % output_directory)

    # Set up API access.  Close the token file promptly instead of
    # leaking the handle (original passed an anonymous open() to yaml).
    with open(args.token_file) as token_file:
        tokens = yaml.safe_load(token_file)
    ACCESS_TOKEN = Twython(tokens['consumer_key'], tokens['consumer_secret'], oauth_version=2).obtain_access_token()
    twython = Twython(tokens['consumer_key'], access_token=ACCESS_TOKEN)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)
    depth = args.depth
    if depth > 0:  # don't initiate ff_finder unless we have to
        ff_finder = FindFriendFollowers(twython, logger)
        unique_screen_names = set(get_ff(screen_names, depth, ff_finder, logger))
    else:
        unique_screen_names = set(screen_names)  # the input list may contain duplicates
    save_screen_names_to_file(unique_screen_names, output_directory + 'screen_names')

    # Gather tweets for each of the unique screen names
    for screen_name in unique_screen_names:
        tweet_filename = output_directory + screen_name + ".tweets"
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue
        try:
            logger.info("Retrieving Tweets for user " + screen_name + " writing to file " + tweet_filename)
            tweets = crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as e:
            # Route the error through the logger instead of a raw print.
            logger.error("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception: bare `raise` preserves the traceback.
                raise
        else:
            save_tweets_to_json_file(tweets, tweet_filename)


if __name__ == "__main__":
    main()
16 changes: 8 additions & 8 deletions twitter_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def get_all_timeline_tweets_for_screen_name_since(self, screen_name, since_id,ma

# Retrieve rest of Tweets
while 1:
if tweets: #Will only trigger
if tweets: #Will only trigger
max_id = int(tweets[-1]['id']) - 1
more_tweets = self._twitter_endpoint.get_data(screen_name=screen_name, count=200, max_id=max_id, since_id=since_id)
tweets += more_tweets
Expand Down Expand Up @@ -303,7 +303,7 @@ def get_all_timeline_tweets_for_id_since(self, user_id, since_id,max_id=None):

# Retrieve rest of Tweets
while 1:
if tweets: #Will only trigger
if tweets: #Will only trigger
max_id = int(tweets[-1]['id']) - 1
more_tweets = self._twitter_endpoint.get_data(user_id=user_id, count=200, max_id=max_id, since_id=since_id)
tweets += more_tweets
Expand Down Expand Up @@ -428,7 +428,7 @@ def get_ff_screen_names_for_screen_name(self, screen_name):
### Accessing data by user_id
###

def get_ff_ids_for_screen_name(self, user_id):
def get_ff_ids_for_id(self, user_id):
"""
Returns Twitter user IDs for users who are both Friends and Followers
for the specified `user_id`.
Expand Down Expand Up @@ -710,8 +710,8 @@ def api_calls_remaining(self):
def lookup_users(self, twitter_ids):
"""
Returns the user lookup for the users specified by `twitter_id`
To maximize throughput of Twitter API, this looks up 100 users with a
single call.
To maximize throughput of Twitter API, this looks up 100 users with a
single call.
"""
# The Twitter API allows us to look up info for 100 users at a time
amassed_users = []
Expand Down Expand Up @@ -867,7 +867,7 @@ def _get_data_with_backoff(self, backoff, **twitter_api_parameters):
return self._twython.get(self._twitter_api_endpoint, params=twitter_api_parameters)
except TwythonError as e:
self._logger.error("TwythonError: %s" % e)

# Twitter error codes:
# https://dev.twitter.com/docs/error-codes-responses

Expand All @@ -878,7 +878,7 @@ def _get_data_with_backoff(self, backoff, **twitter_api_parameters):
time.sleep(backoff)
self._update_rate_limit_status()
return self._get_data_with_backoff(backoff*2, **twitter_api_parameters)
# Sleep if Twitter servers are misbehaving
# Sleep if Twitter servers are misbehaving
elif e.error_code in [502, 503, 504]:
self._logger.error("Twitter servers are misbehaving - sleeping for %d seconds" % backoff)
time.sleep(backoff)
Expand All @@ -887,7 +887,7 @@ def _get_data_with_backoff(self, backoff, **twitter_api_parameters):
elif "Caused by <class 'httplib.BadStatusLine'>: ''" in str(e):
# Twitter servers can sometimes return an empty HTTP response, e.g.:
# https://dev.twitter.com/discussions/20832
#
#
# The code currently detects empty HTTPS responses by checking for a particular
# string:
# Caused by <class 'httplib.BadStatusLine'>: ''"
Expand Down

0 comments on commit b6ea65e

Please sign in to comment.