-
Notifications
You must be signed in to change notification settings - Fork 2
/
Path: vaderTest.py
66 lines (51 loc) · 2 KB
/
vaderTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re, string
import pandas as pd
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# English stop-word list; passed to remove_noise() to filter common tokens.
stop_words = stopwords.words('english')
# Tweet dataset loaded from disk; downstream code expects at least the
# "user_location" and "text" columns — verify against the CSV schema.
covidData = pd.read_csv("cTP2.csv")
def lemmatize_sentence(tokens):
    """Lemmatize every token using a POS-aware WordNet lemmatizer.

    Penn Treebank tags from pos_tag() are collapsed to WordNet categories:
    NN* -> noun ('n'), VB* -> verb ('v'), anything else -> adjective ('a').

    :param tokens: iterable of word tokens
    :return: list of lemmatized tokens in the original order
    """
    lemmatizer = WordNetLemmatizer()

    def _wordnet_pos(treebank_tag):
        # Map a Treebank tag prefix onto the WordNet POS code.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [lemmatizer.lemmatize(word, _wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]
def remove_noise(tweet_tokens, stop_words = ()):
    """Clean a list of tweet tokens for sentiment analysis.

    Strips URLs and @mentions from each token, lemmatizes it with a
    POS-informed WordNet lemmatizer, lower-cases it, and drops empty
    tokens, punctuation, and stop words.

    :param tweet_tokens: list of word tokens from one tweet
    :param stop_words: collection of lower-case words to discard
    :return: list of cleaned, lower-cased tokens
    """
    # Compile the patterns once per call instead of rebuilding the regex
    # strings for every token; raw strings avoid invalid-escape warnings.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                             r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")
    # Fix: the original constructed a new WordNetLemmatizer for EVERY token;
    # one instance is sufficient and much cheaper.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = mention_pattern.sub('', token)
        # Collapse the Treebank tag to a WordNet POS code (default: adjective).
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        # Keep only non-empty, non-punctuation, non-stop-word tokens.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
# Score each tweet with VADER and persist (location, text, compound) rows.
cDTweets = covidData[["user_location", "text"]]
vaderDatar = pd.DataFrame(columns=["location","text", "sentiment"])
analyzer = SentimentIntensityAnalyzer()
# Fix: the original indexed the row Series positionally (i[1][1]), which is
# deprecated/ambiguous on a string-labeled index and breaks if the column
# order changes — access columns by name instead.
for idx, row in cDTweets.iterrows():
    try:
        # "compound" is VADER's normalized overall score in [-1, 1].
        vs = analyzer.polarity_scores(row["text"])
        vDInsert = {"location": row["user_location"], "text": row["text"], "sentiment": vs["compound"]}
        vaderDatar.loc[idx] = vDInsert
    except TypeError:
        # Best-effort: rows whose text is not a string (e.g. NaN -> float)
        # raise TypeError inside VADER; log the index and skip the row.
        print("Float is not iterable but we do not care")
        print(idx)
vaderDatar.to_csv("covidVD.csv")
print("to csv reached")