Skip to content

Commit

Permalink
A bit messy, but now skips IDs that previously returned an HTTP 404 or 401 error
Browse files Browse the repository at this point in the history
  • Loading branch information
Coppersmith committed Sep 18, 2015
1 parent e95b8c5 commit a6f26ef
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions save_timelines_to_json_gz.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,44 @@ def main():
parser = argparse.ArgumentParser(description="")
parser.add_argument('id_file')
parser.add_argument('output_loc')
parser.add_argument('--token_file',dest='token_file',default=None)
args = parser.parse_args()

logger = get_console_info_logger()

#Optionally pass as a parameter
#There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though
oauth_settings_file_loc = args.token_file
if oauth_settings_file_loc:
print "Using tokens from:", oauth_settings_file_loc
exec(open(oauth_settings_file_loc).read())

ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
twython = Twython(consumer_key, access_token=ACCESS_TOKEN)
crawler = CrawlTwitterTimelines(twython, logger)

twitter_ids = get_screen_names_from_file(args.id_file)
twitter_ids.reverse() #HARDCODE
output_loc = args.output_loc

tempfile_loc = 'tmp/'
os.system('mkdir -p '+tempfile_loc)

#load previously broken ID files so we don't try to read them again
broken_ids = set([]) #Defaults to an empty set
try:
broken_ids = set([long(x).strip() for x in open(tempfile_loc + '404d').readlines()])
except:
pass
try:
broken_ids = broken_ids.union(set([long(x).strip() for x in open(tempfile_loc + '401d').readlines()]))
except:
pass

for twitter_id in twitter_ids:
if twitter_id in broken_ids:
print '%s was previously inaccessible, not trying to download.' % twitter_id
continue
tweet_filename = output_loc + "%s.tweets.gz" % twitter_id
if os.path.exists(tweet_filename):
logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, twitter_id))
Expand All @@ -64,8 +90,12 @@ def main():
print "TwythonError: %s" % e
if e.error_code == 404:
logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % twitter_id)
with open(tempfile_loc + '404d','a') as OUT:
OUT.write('%s\n' % twitter_id)
elif e.error_code == 401:
logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % twitter_id)
with open(tempfile_loc + '401d','a') as OUT:
OUT.write('%s\n' % twitter_id)
else:
# Unhandled exception
print e
Expand Down

0 comments on commit a6f26ef

Please sign in to comment.