from utils.summarizer import *
from utils.clustering_analysis import *

- # ========================= [Load config] ===========================
- with open("config.yaml", "r") as f:
-     config = yaml.load(f, Loader=yaml.FullLoader)
- config = Box(config)
+ def exe_by_sentences(text: str):
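+     """Sentence-level summarization pipeline: load the config, segment the input
+     text, cluster the segments, summarize each cluster, and evaluate the result.
+     Returns the summaries, their importance scores, the evaluation metrics, and
+     the path of the cluster visualization."""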
+     # ========================= [Load config] ===========================
+     with open("config.yaml", "r") as f:
+         config = yaml.load(f, Loader=yaml.FullLoader)
+     config = Box(config)

- print('Experiment name:', config.experiment_name)
- print('===============================================')
-
- # ========================== [Run experiments] ==========================
- def summarize_and_visualization(text, config):
-     max_score = 0
-     best_summary = ""
-     best_index = 0
-
-     evaluation_results = []
-
-     init_s = time.time()

    # ========================== [Segmentation] ========================
    print("Segmentating... ", end="", flush=True)
@@ -55,12 +44,12 @@ def summarize_and_visualization(text, config):
    avg_group_size = np.mean([len(group) for group in concat_indices])
    print(f"Num. of Cluster: {len(concat_indices)}, Max group size: {max_group_size}, Avg. group size: {avg_group_size:.2f}")

-     # Let's visualize the clustering result here with the function imported below
-
    # ========================== [Ready to summarize] ==================
-     batch_clusters = [  # sentences merged by topic
+     batch_clusters = [
        " ".join([segments[gi] for gi in group]) for group in concat_indices
    ]
+
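+     # analyze_text_clusters is assumed to save a visualization of the clusters and return its file path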
+     visualize_pth = analyze_text_clusters(segments, concat_indices)

    # ========================== [Summarize] ===========================
    print("Summarizing... ", end="", flush=True)
@@ -71,78 +60,197 @@ def summarize_and_visualization(text, config):
                           config.mini_batch.size)

        batch_summaries = []
-         batch_token_importances = []
+         batch_importances = []
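+         # summarize each mini-batch; cal_grad=True is assumed to make summarizer also return importance scores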
        for i in range(0, len(batch_clusters), mini_batch_size):
-             mini_batch_summaries, mini_batch_token_importance = summarizer(batch_clusters[i:i+mini_batch_size], **config.summary.args)
+             mini_batch_summaries, mini_batch_importances = summarizer(batch_clusters[i:i+mini_batch_size], cal_grad=True, **config.summary.args)
            batch_summaries.append(mini_batch_summaries)
-             batch_token_importances.append(mini_batch_token_importance)
-         batch_summaries = " ".join(batch_summaries)
-         # Logic to combine or average the token_importance values may be needed.
-         token_importance = np.mean(batch_token_importances, axis=0)
+             batch_importances.append(mini_batch_importances)
+         total_summaries = " ".join(batch_summaries)
    else:
-         batch_summaries, token_importance = summarizer(batch_clusters, **config.summary.args)
+         batch_summaries, batch_importances = summarizer(batch_clusters, cal_grad=True, **config.summary.args)
+         total_summaries = " ".join(batch_summaries)
    e = time.time()
    print("Done", f"{e-s:.2f} sec")

    # ========================== [Evaluate] ============================
    print("Evaluating... ", end="", flush=True)
    s = time.time()

-     rouge1, rouge2, rougeL = calculate_rouge_scores(text, batch_summaries)
-     b_score = calculate_bert_score(text, batch_summaries)
+     rouge1, rouge2, rougeL = calculate_rouge_scores(text, total_summaries)
+     s_score = calculate_semantic_similarity(text, total_summaries)
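+     # note: ROUGE and the similarity score are computed against the original input text, not a gold reference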

    # scale score * 100
    rouge1, rouge2, rougeL = rouge1*100, rouge2*100, rougeL*100
-     b_score = b_score * 100
+     s_score = s_score * 100
+
+     e = time.time()
+     print("Done", f"{e-s:.2f} sec")

-     # ========================== [Post-process] ========================
-     if b_score > max_score:  # use a single score that can be compared by magnitude
-         max_score = b_score
-         best_summary = batch_summaries
-         best_index = 0
+     print(f"=> ROUGE-1: {rouge1:.2f}, ROUGE-2: {rouge2:.2f}, ROUGE-L: {rougeL:.2f}")
+     print(f"=> BERTScore: {s_score:.2f}")

-     evaluation_results.append({
-         'summary': batch_summaries,
+     # ========================== [Post-process] ========================
+     evaluation_results = {
        'rouge1': rouge1,
        'rouge2': rouge2,
        'rougeL': rougeL,
-         'bert_score': b_score,
-         'token_importance': token_importance.tolist(),
-         # 'visualization': visualization_path  # the graph visualization path still needs to be added
-     })
-
-     # Return all results.
-     return evaluation_results
-
- def brushing_and_resummarize(datasets, config, selected_text):
-     """
-     Generate a summary based on the similarity between the user-selected text and the full text.
-
-     Args:
-     - datasets (list of str): list of full texts.
-     - config (Box): configuration object. -> honestly, not sure why this is needed
-     - selected_text (str): text selected by the user.
-
-     Returns:
-     - list of dict: summary and evaluation results for each text.
-     """
-     results = []
-     # Split the full text into sentences
-     sentences = datasets.split('. ')
+         'bert_score': s_score
+     }
+
+     return batch_summaries, batch_importances, evaluation_results, visualize_pth
+
+ def resummarize_with_sentece(full_text: str, target_text: str):
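+     """Re-summarize `full_text` with respect to `target_text`: keep only the segments
+     that are semantically similar to the selected text and summarize the filtered text."""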
+     # ========================= [Load config] ===========================
+     with open("config.yaml", "r") as f:
+         config = yaml.load(f, Loader=yaml.FullLoader)
+     config = Box(config)
+
+     # ========================== [Segmentation] ========================
+     print("Segmentating... ", end="", flush=True)
+     s = time.time()
+     segments = segmentate_sentence(full_text, **config.segment.args)
+     e = time.time()
+     print("Done", f"{e-s:.2f} sec")
+
+     # ========================== [Filtering] ==========================
+     print("Filtering... ", end="", flush=True)
+
+     filtered_text = []
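+     # keep only segments whose semantic similarity to the target text exceeds 0.8 (hard-coded threshold)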
+     for segment in segments:
+         if calculate_semantic_similarity(segment, target_text) > 0.8:
+             filtered_text.append(segment)
+
+     filtered_text = " ".join(filtered_text)
+
+     # ========================== [Summarize] ===========================
+     print("Summarizing... ", end="", flush=True)
+     batch_summaries = summarizer(filtered_text, cal_grad=False, **config.summary.args)
+
+     return batch_summaries
+
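+ # NOTE: the previous module-level pipeline appears to be kept below, commented out, for reference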
+ # # ========================= [Load config] ===========================
+ # with open("config.yaml", "r") as f:
+ # config = yaml.load(f, Loader=yaml.FullLoader)
+ # config = Box(config)
+
+ # print('Experiment name:', config.experiment_name)
+ # print('===============================================')
+
+ # # ========================== [Run experiments] ==========================
+ # def summarize_and_visualization(text, config):
+ # max_score = 0
+ # best_summary = ""
+ # best_index = 0
+
+ # evaluation_results = []
+
+ # init_s = time.time()
+
+ # # ========================== [Segmentation] ========================
+ # print("Segmentating... ", end="", flush=True)
+ # s = time.time()
+ # segments = segmentate_sentence(text, **config.segment.args)
+ # e = time.time()
+ # print("Done", f"{e-s:.2f} sec")
+
+ # # ========================== [Clustering] ==========================
+ # print("Clustering... ", end="", flush=True)
+ # s = time.time()
+ # concat_indices = globals()[config.concat.method](segments, **config.concat.args)
+ # e = time.time()
+ # print("Done", f"{e-s:.2f} sec")
+
+ # max_group_size = max([len(group) for group in concat_indices])
+ # avg_group_size = np.mean([len(group) for group in concat_indices])
+ # print(f"Num. of Cluster: {len(concat_indices)}, Max group size: {max_group_size}, Avg. group size: {avg_group_size:.2f}")
+
+ # # Let's visualize the clustering result here with the function imported below
+
+ # # ========================== [Ready to summarize] ==================
+ # batch_clusters = [ # sentences merged by topic
+ # " ".join([segments[gi] for gi in group]) for group in concat_indices
+ # ]
+
+ # # ========================== [Summarize] ===========================
+ # print("Summarizing... ", end="", flush=True)
+ # s = time.time()
+ # if config.mini_batch.size > 0:
+ # mini_batch_size = (len(batch_clusters)
+ # if len(batch_clusters) < config.mini_batch.size else
+ # config.mini_batch.size)
+
+ # batch_summaries = []
+ # batch_token_importances = []
+ # for i in range(0, len(batch_clusters), mini_batch_size):
+ # mini_batch_summaries, mini_batch_token_importance = summarizer(batch_clusters[i:i+mini_batch_size], **config.summary.args)
+ # batch_summaries.append(mini_batch_summaries)
+ # batch_token_importances.append(mini_batch_token_importance)
+ # batch_summaries = " ".join(batch_summaries)
+ # # Logic to combine or average the token_importance values may be needed.
+ # token_importance = np.mean(batch_token_importances, axis=0)
+ # else:
+ # batch_summaries, token_importance = summarizer(batch_clusters, **config.summary.args)
+ # e = time.time()
+ # print("Done", f"{e-s:.2f} sec")
+
+ # # ========================== [Evaluate] ============================
+ # print("Evaluating... ", end="", flush=True)
+ # s = time.time()
+
+ # rouge1, rouge2, rougeL = calculate_rouge_scores(text, batch_summaries)
+ # b_score = calculate_bert_score(text, batch_summaries)
+
+ # # scale score * 100
+ # rouge1, rouge2, rougeL = rouge1*100, rouge2*100, rougeL*100
+ # b_score = b_score * 100
+
+ # # ========================== [Post-process] ========================
+ # if b_score > max_score: # use a single score that can be compared by magnitude
+ # max_score = b_score
+ # best_summary = batch_summaries
+ # best_index = 0
+
+ # evaluation_results.append({
+ # 'summary': batch_summaries,
+ # 'rouge1': rouge1,
+ # 'rouge2': rouge2,
+ # 'rougeL': rougeL,
+ # 'bert_score': b_score,
+ # 'token_importance': token_importance.tolist(),
+ # # 'visualization': visualization_path # the graph visualization path still needs to be added
+ # })
+
+ # # Return all results.
+ # return evaluation_results
+
+ # def brushing_and_resummarize(datasets, config, selected_text):
+ # """
+ # Generate a summary based on the similarity between the user-selected text and the full text.
+
+ # Args:
+ # - datasets (list of str): list of full texts.
+ # - config (Box): configuration object. -> honestly, not sure why this is needed
+ # - selected_text (str): text selected by the user.
+
+ # Returns:
+ # - list of dict: summary and evaluation results for each text.
+ # """
+ # results = []
+ # # Split the full text into sentences
+ # sentences = datasets.split('. ')

-     # Compute the similarity between each sentence and the selected text
-     similarities = [calculate_semantic_similarity(sentence, selected_text) for sentence in sentences]
+ # # Compute the similarity between each sentence and the selected text
+ # similarities = [calculate_semantic_similarity(sentence, selected_text) for sentence in sentences]

-     # Sort by similarity and select the top n sentences
-     n = 3  # number of sentences to include in the summary
-     top_sentences = [sentences[i] for i in np.argsort(similarities)[-n:]]
+ # # Sort by similarity and select the top n sentences
+ # n = 3 # number of sentences to include in the summary
+ # top_sentences = [sentences[i] for i in np.argsort(similarities)[-n:]]

-     # Join the selected sentences to create the summary
-     summary = '. '.join(top_sentences)
+ # # Join the selected sentences to create the summary
+ # summary = '. '.join(top_sentences)


-     result = {
-         'summary': summary,
-     }
+ # result = {
+ # 'summary': summary,
+ # }

-     return result
+ # return result