forked from networkdynamics/BattleBotsTemplate
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerateTypos.py
274 lines (218 loc) ยท 13 KB
/
generateTypos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import random
import typo
import math
import re
class TypoGenerator:
def __init__(self, text, seed=None):
self.text = text
self.result = text
self.seed = seed if seed is not None else random.randint(0, 10**6)
random.seed(self.seed) # Set the random seed
self.myStrErrer = typo.StrErrer(text,seed=self.seed)
def char_swap(self, preservefirst=False, preservelast=False):
# Assume implementation exists
self.result=self.myStrErrer.char_swap().result
def missing_char(self, preservefirst=False, preservelast=False):
# Assume implementation exists
self.result=self.myStrErrer.missing_char().result
def extra_char(self, preservefirst=False, preservelast=False):
# Assume implementation exists
self.result=self.myStrErrer.extra_char().result
def nearby_char(self, preservefirst=False, preservelast=False):
# Assume implementation exists
self.result=self.myStrErrer.nearby_char().result
def similar_char(self):
# Assume implementation exists
self.result=self.myStrErrer.similar_char().result
def skipped_space(self):
# Assume implementation exists
self.result=self.myStrErrer.skipped_space().result
def random_space(self):
# Assume implementation exists
self.result=self.myStrErrer.random_space().result
def repeated_char(self):
# Assume implementation exists
self.result=self.myStrErrer.repeated_char().result
def unichar(self):
# Assume implementation exists
self.result=self.myStrErrer.unichar().result
def determine_typo_count(self):
"""Determine the number of typos based on the text length."""
chars_per_100 = len(self.text) / 100
mean_typos = 0.9
# std_dev_typos = 0.428571
std_dev_typos = .42
# Sample from normal distribution
typo_count = max(0, round(random.gauss(mean_typos * chars_per_100, std_dev_typos * chars_per_100)))
return typo_count
def apply_typo(self):
"""Applies a random typo based on predefined probabilities."""
typo_functions = [
(self.char_swap, 0.15), # 15% chance
(self.missing_char, 0.15), # 15% chance
(self.extra_char, 0.10), # 10% chance
(self.nearby_char, 0.10), # 10% chance
(self.similar_char, 0.10), # 10% chance
(self.skipped_space, 0.10), # 10% chance
(self.random_space, 0.10), # 10% chance
(self.repeated_char, 0.10), # 10% chance
(self.unichar, 0.10), # 10% chance
]
# Weighted sampling to select a function
functions, weights = zip(*typo_functions)
selected_function = random.choices(functions, weights, k=1)[0]
# print(selected_function)
# Apply the selected function
selected_function()
def apply_multiple_typos(self):
"""Applies typos to the string based on its length."""
# Determine the number of typos
num_typos = self.determine_typo_count()
# print(num_typos)
for _ in range(num_typos):
self.apply_typo()
return self.result
class TweetAugmenter:
def __init__(self, seed=None):
self.seed = seed if seed is not None else random.randint(0, 10**6)
random.seed(self.seed)
# def add_emojis(self, text):
# """Adds or multiplies emojis in the text based on probabilities, ensuring multiplication happens only once."""
# all_emojis = ["๐", "๐ฅ", "๐", "๐", "๐", "โค๏ธ", "๐", "๐ข", "๐ก", "๐", "๐ค", "๐", "๐", "๐ฏ", "๐", "๐", "๐"]
# emoji_pattern = r"[^\w\s,]"
# emojis = re.findall(emoji_pattern, text) # Find all existing emojis
# emoji_multiplier_applied = False # Ensure multiplication happens only once
# if emojis: # If emojis exist in the text
# if random.random() < 0.6 and not emoji_multiplier_applied: # 60% chance to multiply existing emojis
# # Multiply the first emoji found
# emoji_to_multiply = emojis[0]
# additional_emojis = " ".join(random.choices(all_emojis, k=random.randint(1, 2)))
# text = text.replace(emoji_to_multiply, f"{emoji_to_multiply} {additional_emojis}", 1)
# emoji_multiplier_applied = True
# elif random.random() < 0.5: # 50% chance to add emojis if none exist
# num_emojis = random.randint(1,4)
# new_emojis = " ".join(random.choices(all_emojis, k=num_emojis))
# text += f" {new_emojis}"
# return text
#expand the emojis to all possible, and don't just add, when you see one add between 1-4 others next to it with 60% chance.
def add_hashtags(self, text):
if random.random() < 0.60: # 20% chance
words = text.split()
num_hashtags = random.randint(1, 5)
hashtagged_words = random.sample(words, k=min(num_hashtags, len(words)))
for word in hashtagged_words:
text = text.replace(word, f"#{word}")
return text
def double_links(self, text):
link_pattern = r"https://t.co/twitter_link"
links = re.findall(link_pattern, text)
if links and random.random() < 0.30: # 15% chance
doubled_links = [f"{link} {link}" for link in links]
for original, doubled in zip(links, doubled_links):
text = text.replace(original, doubled, 1)
return text
def fix_links(self, text):
words = text.split()
updated_words = [
"https://t.co/twitter_link" if "://" in word else word for word in words
]
return " ".join(updated_words)
def add_mentions(self, text):
if random.random() < 0.40: # 10% chance
num_mentions = random.randint(1, 3)
mentions = [f"@mention" for _ in range(num_mentions)]
text += " " + " ".join(mentions)
return text
def exaggerate_words(self, text):
"""Exaggerates punctuation or the ending letters of a word."""
words = text.split()
# Choose one word or punctuation to exaggerate (if applicable)
indices_to_exaggerate = [
i for i, word in enumerate(words)
if word.endswith(("!", "?")) or len(word) > 1
]
if indices_to_exaggerate:
# Randomly select one word/punctuation to exaggerate
index = random.choice(indices_to_exaggerate)
# If the word ends with "!" or "?", exaggerate the punctuation
if words[index].endswith(("!", "?")):
punctuation = words[index][-1] # Get the last character (! or ?)
exaggeration = punctuation * random.randint(1, 5) # Repeat 1-5 times
words[index] += exaggeration
# Otherwise, exaggerate the last character of the word
elif random.random() < 0.05: # 5% chance
words[index] += words[index][-1] * random.randint(2, 6)
return " ".join(words)
#don't just randomly add an exclamation mark. when you see exclamation or question mark, expand them by adding multiple next to it (with 20% chance). same for extra letters to end of a word.
def change_case(self, text):
words = text.split()
for i in range(len(words)):
if random.random() < 0.1: # 5% chance
if words[i] != "https://t.co/twitter_link":words[i] = words[i].upper()
elif random.random() < 0.05: # 1% chance
if words[i] != "https://t.co/twitter_link":words[i] = ''.join(c.upper() if random.random() < 0.5 else c.lower() for c in words[i])
return " ".join(words)
def randomize_sentence_case(self, text):
if random.random() < 0.8: # 80% chance
text = text[0].lower() + text[1:] if text else text
return text
def process_tweet(self, tweet):
typo_gen = TypoGenerator(tweet, seed=self.seed)
tweet = typo_gen.apply_multiple_typos()
# tweet = self.add_emojis(tweet)
tweet = self.add_hashtags(tweet)
tweet = self.double_links(tweet)
tweet = self.add_mentions(tweet)
tweet = self.exaggerate_words(tweet)
tweet = self.change_case(tweet)
tweet = self.randomize_sentence_case(tweet)
tweet = self.fix_links(tweet)
return tweet
# make sure when adding typos to the tweets that the typos do not apply on words "@mention" or "https://t.co/twitter_link"
def process_tweets(self, tweets):
return [self.process_tweet(tweet) for tweet in tweets]
# Example Usage
# tweets = ["Imagine combining the best aspects of today's pop icons into one artist!? Dreams could come true! โจ", 'Spotted Mahomes enjoying time courtside! Such vibes ๐ฅ', "Nominees are in, and OMGโI can't even choose the best one this week! ๐๏ธ", 'Just won a small victory, and it feels good! ๐ช https://t.co/twitter_link', 'Definitely excited for what comes next! ๐', 'Getting super into DIY home decor latelyโso many ideas! ๐ ๏ธ๐ก', 'Wrapping up the season rewatch and feeling nostalgic. Some episodes were definitely underrated! ๐บ https://t.co/twitter_link', 'The way my team played in the first half had me screaming at the TV. Come on guys! #StayFocused', "Whoa, that angle was wild! My brain can't handle all this creativity! ๐", 'Tried starting my day productive but ended up napping insteadโฆ oops! ๐ค', "Our team's skating looks sharp these days! ๐ Bring it on!", "Feeling something special about tonight's game, I just know itโs going to be memorable #LetsGoTeam", "Adrenaline pumping; I'm always ready to face challenges head-on! ๐ฅ๐ซถ๐พ https://t.co/twitter_link", "Who's streaming the latest movie releases? Drop your recommendations! ๐ฌ๐", "It's funny how trends come back aroundโremember when we all wore those baggy jeans? ๐ #Throwback https://t.co/twitter_link", 'Only way to get through this week is to find some cursed lyrics to keep me entertained ๐ https://t.co/twitter_link', 'Gotta share this gem I stumbled upon after finishing a great movie! ๐ @mention', 'Was that play unexpected or what?! The energy was palpable out there! ๐๐', 'What if we did something completely radical and reinvented everything? Just a thought ๐ญ https://t.co/twitter_link', "Can't stress enough how epic that last action scene was! ๐ฅ๐ฟ https://t.co/twitter_link", 'That penalty call was absolutely ridiculous ๐ #GameDay', 'Fantasy trade proposition got everyone talkingโwhat are your predictions? ๐ฎ', 'Current mood: spontaneous adventures are the best kind of therapy ๐โจ', 'Ayy, anyone else craving congolese food? Just hit differently! ๐๐ฟ https://t.co/twitter_link', 'People sleeping on this track owe it another listenโitโs actually fire ๐ฅ https://t.co/twitter_link', "Rumors swirling about health checks pre-tradesโwhat's going on in Hollywood?", 'Simplifying life is usually the key to happiness ๐ https://t.co/twitter_link']
def augmentTweets(tweets):
augmenter = TweetAugmenter()
augmented_tweets = augmenter.process_tweets(tweets)
return augmented_tweets
# for original, augmented in zip(tweets, augmented_tweets):
# print(f"Original: {original}")
# print(f"Augmented: {augmented}")
# print("-" * 50)
# Example Usage
# typo_gen = TypoGenerator("oh my god who is this dickhead on espn2 complaining about the refs reviewing a temple player getting hit in the face to declare a flagrant foul is ruining the sport")
# output = typo_gen.apply_multiple_typos()
# print(f"Original: oh my god who is this dickhead on espn2 complaining about the refs reviewing a temple player getting hit in the face to declare a flagrant foul is ruining the sport")
# print(f"With Typos: {output}")
#INSTRUCTIONS
#typos:
#using the above class, add typos to the sentence as shown above. however, when seeing a link "https://t.co/twitter_link" don't apply typos to the link.
#INSERT CLASS HERE
#emojis:
#when you see only one emoji, add btwn 1-4 50% of the time.
#INSERT CLASS HERE
#hashstags
# add hashtags by hashtagging btwn 1-4 words 20% of the time.
# only allow 10% of sentences to have only 1 hashtag.
#INSERT CLASS HERE
#links:
# 15% of the time double up links by adding another "https://t.co/twitter_link" after the one that is there.
#INSERT CLASS HERE
# mentions
# assure ~10% of tweets have mentions. 50% of those should have >1 mention (btwn 1-4).
#INSERT CLASS HERE
#special chars:
#when you see
#INSERT CLASS HERE
#sentence structure:
#don't always start the tweets with capital letters.
#INSERT CLASS HERE
#words
#btwn 0.5 and 5% of words can be exagerated by adding many extra chars (btwn 1-10 repetitions) to the end of the word. i.e. hi becomes hiiii
# btwn 0.5 and 5% of words can be ALL UPPERCASE
#btwn 0.5 and 1% of words can be iNtErCHAngEd uPPerCase LIkE thIs
#INSERT CLASS HERE
#bring it all together
#create a function that takes in as input an array of tweets (string sentences), and applies all the above classes to it taking in mind the mentioned probabilities.