-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_diction.py
80 lines (72 loc) · 3.06 KB
/
train_diction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#coding=utf-8
'''
Following the approach described at
http://blog.csdn.net/u012052268/article/details/77825981:
pull the topic words out of the raw training data and turn each one into an
entry of a custom (jieba) user dictionary, so that when the content is
segmented again each known topic is kept as a single token, reducing
topic-mismatch errors.
The sentiment keywords in the raw data are extracted into a custom
dictionary in the same way.
'''
import jieba
import pandas as pd
# Global registries filled while scanning the spreadsheet; the write_* helpers
# below dump their contents to txt files.
dic_theme = {}      # topic word -> 1
dic_sentiment = {}  # sentiment word -> 1 (or a polarity string, see split_sentiment)
# Collect the topic words of one cell into the global dic_theme.
def seg_sentence(sentence):
    """Split one cell's topic string on ';' and register every non-empty,
    non-"NULL" topic word in the global ``dic_theme`` with value 1.

    The "NULL;" placeholder is removed up front; a trailing bare "NULL"
    (no semicolon) is caught by the per-word filter instead.
    """
    for word in str(sentence).replace("NULL;", "").split(';'):
        # Membership test dropped: re-assigning the constant 1 is idempotent.
        if word and word != "NULL":
            dic_theme[word] = 1
# Collect the sentiment keywords of one cell into the global dic_sentiment.
def split_sentence(sentence):
    """Split one cell's sentiment-keyword string on ';' and register every
    non-empty keyword in the global ``dic_sentiment`` with value 1.
    """
    for word in str(sentence).split(";"):
        # `in dic` instead of `in dic.keys()`; skip empty fragments.
        if word and word not in dic_sentiment:
            dic_sentiment[word] = 1
# Extract every topic word from the training data and write them to a txt file.
def theme_tiqu(filename='data/train.xlsx'):
    """Read the training spreadsheet, drop rows whose topic column carries no
    real content, feed the remaining topic cells through ``seg_sentence``
    (which fills the global ``dic_theme``), and write the collected topic
    words to data/theme_words.txt.
    """
    df = pd.read_excel(filename)
    # Mask rows whose topic cell is NaN or reduces to nothing once the
    # placeholder characters ("N","U","L",";") and whitespace are stripped.
    # NOTE(review): the original used .isspace(), which is False for the empty
    # string, so a cell of exactly "NULL;" slipped through the mask (harmless
    # only because seg_sentence re-filters); the mask now matches its intent.
    empty_theme = df["theme-主题"].isnull() | df["theme-主题"].apply(
        lambda x: not str(x).strip("NULL;").strip()
    )
    df[~empty_theme]['theme-主题'].apply(seg_sentence)
    write_dic('data/theme_words.txt', dic_theme)
# Extract every sentiment keyword from the training data and write them to a txt file.
def sentiment_word_tiqu(filename='data/train.xlsx'):
    """Read the training spreadsheet, feed each sentiment-keyword cell through
    ``split_sentence`` (which fills the global ``dic_sentiment``), then write
    the collected words to data/sentiment_words.txt.
    """
    frame = pd.read_excel(filename)
    frame["sentiment_word-情感关键词"].apply(split_sentence)
    write_dic('data/sentiment_words.txt', dic_sentiment)
# Write the keys of a dictionary to a text file, one key per line.
def write_dic(file_path, dic):
    """Write every key of ``dic`` whose value is 1 to ``file_path`` (UTF-8),
    one key per line.

    Entries whose value is not 1 (e.g. polarity strings stored by
    split_sentiment) are skipped, matching the original filter.
    """
    # `with` guarantees the handle is closed even if a write fails.
    with open(file_path, 'w', encoding='UTF-8') as fh:
        for word, flag in dic.items():
            if flag == 1:
                fh.write(word + "\n")
    print("write finish!!!")
# Record each sentiment word together with its polarity label and dump them.
def split_sentiment(filename='data/train.xlsx'):
    """Read the training spreadsheet, pair each sentiment keyword with the
    polarity value at the same position in the companion column, store
    word -> polarity in the global ``dic_sentiment``, and write the result
    to data/sentiment_words_正负面.txt.

    NOTE(review): words already present in dic_sentiment (e.g. inserted with
    value 1 by sentiment_word_tiqu) keep their old value — original behavior.
    """
    df = pd.read_excel(filename)
    for row in range(len(df)):
        words = str(df.loc[row, 'sentiment_word-情感关键词']).split(";")
        labels = str(df.loc[row, 'sentiment_anls-情感正负面']).split(";")
        # zip() walks the two lists positionally; unlike the original indexed
        # loop (which also shadowed the outer loop variable `i`), it cannot
        # raise IndexError when the label list is shorter than the word list.
        for word, label in zip(words, labels):
            if word and word not in dic_sentiment:
                dic_sentiment[word] = label
    write_sentiment('data/sentiment_words_正负面.txt', dic_sentiment)
def write_sentiment(file_path, dic):
    """Write one "word label" pair per line to ``file_path`` (UTF-8), where
    ``dic`` maps sentiment word -> polarity label.
    """
    # `with` guarantees the handle is closed even if a write fails.
    with open(file_path, 'w', encoding='UTF-8') as fh:
        for word in dic:
            # str() guards against non-string values: sentiment_word_tiqu
            # leaves the int 1 in dic_sentiment, and the original
            # `w + " " + dic[w]` raised TypeError on those entries.
            fh.write(word + " " + str(dic[word]) + "\n")
    print("write finish!!!")
# jieba.load_userdict('userdict.txt')
# Script entry: build the topic and sentiment dictionaries from data/train.xlsx.
theme_tiqu()
sentiment_word_tiqu()
#df = pd.read_excel('data/train.xlsx')
# Extract these two columns of data
#df['sentiment_anls-情感正负面'].apply(test)
#df.loc[:,[u'sentiment_word-情感关键词',u'sentiment_anls-情感正负面']].apply(test)
split_sentiment()