-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample.py
More file actions
78 lines (61 loc) · 2.97 KB
/
sample.py
File metadata and controls
78 lines (61 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Samples retweets from the Twitter stream and saves each new author's id and language in a CSV file
import json
import csv
import twarc
import tweepy
import config
# Upper bound on the number of distinct authors recorded for each language.
MAX_USER_PER_LANG = 10_000
# Header row written at the top of the output CSV.
FIELD_NAMES = ['author_id', 'lang', 'rt_ratio']
# Path of the CSV file that the sampling run writes to.
FILE_NAME = "outputs/authors_langs_ratio.csv"
class Sampler(tweepy.StreamingClient):
    """Streaming client that records one CSV row per newly seen retweeting author.

    For every streamed tweet that is a retweet, written in one of the tracked
    languages, and authored by an account not recorded yet, a row
    ``[author_id, lang, 0.0]`` is written through ``writer``. Once every
    tracked language has reached ``MAX_USER_PER_LANG`` authors, the stream is
    disconnected.
    """

    def __init__(self, writer, bearer_token):
        """Set up the per-language counters and connect credentials.

        Args:
            writer: Object exposing ``writerow(row)`` (a ``csv.writer`` in the
                script below); receives one row per recorded author.
            bearer_token: Twitter API bearer token forwarded to
                ``tweepy.StreamingClient``.
        """
        # Number of authors already recorded per tracked language; its keys
        # also define which languages are accepted.
        # NOTE(review): the starting values are hard-coded and non-zero
        # (the zeroed variant is kept in the trailing comment) — presumably
        # they resume an earlier, interrupted run; confirm before reuse.
        self.num_match = {'fr': 6684, 'en': 10000, 'tr': 10000}  # {"fr": 0, "en": 0, "tr": 0}
        # Maps each recorded author_id to the language of the tweet that got
        # the author recorded; used to skip authors seen before.
        self.ids: dict[int, str] = {}
        self.writer = writer
        super().__init__(bearer_token=bearer_token)

    def check_lang(self, tweet: tweepy.Tweet) -> bool:
        """Reserve a slot for the tweet's language if one is still available.

        Returns ``True`` and increments the counter of ``tweet.lang`` when the
        language is tracked (French, English or Turkish) and fewer than
        ``MAX_USER_PER_LANG`` authors have been recorded for it. Returns
        ``False`` without touching any counter otherwise, including when the
        tweet carries no language.

        Note that a ``True`` result has the side effect of consuming one slot,
        so this must be called at most once per recorded author.
        """
        lang = tweet.lang
        count = self.num_match.get(lang)
        if count is None or count >= MAX_USER_PER_LANG:
            return False
        self.num_match[lang] = count + 1
        return True

    def is_retweet(self, tweet: tweepy.Tweet) -> bool:
        """Return whether the tweet references another tweet as a retweet.

        The ``referenced_tweets`` list is only inspected, never modified, and
        a missing or empty list yields ``False``.
        """
        references = tweet.referenced_tweets or ()
        return any(ref.type == "retweeted" for ref in references)

    def on_tweet(self, tweet: tweepy.Tweet):
        """Handle one streamed tweet: record its author if it qualifies.

        An author is recorded when the tweet is a retweet, has an author id
        that was not recorded before, and its language still has room. The
        stream is disconnected as soon as every language quota is full.
        """
        if self.is_retweet(tweet):
            author_id = tweet.author_id
            # check_lang comes last on purpose: it consumes a quota slot, so
            # it must only run for authors that will actually be recorded.
            if (
                author_id is not None
                and author_id not in self.ids
                and self.check_lang(tweet)
            ):
                print(self.num_match)
                self.ids[author_id] = tweet.lang
                # The third column (rt_ratio) is always written as 0.0 here.
                self.writer.writerow([author_id, tweet.lang, 0.0])

        # Checked on every tweet so the stream also stops when all quotas
        # were already full before any author was recorded in this run.
        if all(count >= MAX_USER_PER_LANG for count in self.num_match.values()):
            # Stop the stream cleanly instead of killing the interpreter, so
            # the caller regains control and can close the output file.
            self.disconnect()
# Create the output CSV, write its header row, then instantiate the Sampler
# and stream sampled tweets into the file until sampling stops.
with open(FILE_NAME, mode="w", newline="") as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(FIELD_NAMES)
    sampler = Sampler(bearer_token=config.BEARER_TOKEN, writer=csv_writer)
    sampler.sample(
        tweet_fields="lang",
        expansions=["author_id", "referenced_tweets.id"],
    )
# t = twarc.client2.Twarc2(bearer_token=config.BEARER_TOKEN)
# tweets = t.sample()
# with open("sample2.json", "w") as f:
# for tweet in tweets:
# print(tweet)
# f.write(json.dumps(tweet)+"\n")