
Commit 0f9cf0f

Update exe_by_sentences, resummarize_with_sentece
1 parent b2bb891 commit 0f9cf0f

File tree

1 file changed: +178 -70 lines changed


mainrun.py

+178 -70
@@ -19,23 +19,12 @@
 from utils.summarizer import *
 from utils.clustering_analysis import *
 
-# ========================= [Load config] ===========================
-with open("config.yaml", "r") as f:
-    config = yaml.load(f, Loader=yaml.FullLoader)
-config = Box(config)
+def exe_by_sentences(text: str):
+    # ========================= [Load config] ===========================
+    with open("config.yaml", "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    config = Box(config)
 
-print('Experiment name:', config.experiment_name)
-print('===============================================')
-
-# ========================== [Run experiments] ==========================
-def summarize_and_visualization(text, config):
-    max_score = 0
-    best_summary = ""
-    best_index = 0
-
-    evaluation_results = []
-
-    init_s = time.time()
 
     # ========================== [Segmentation] ========================
     print("Segmentating... ", end="", flush=True)
@@ -55,12 +44,12 @@ def summarize_and_visualization(text, config):
     avg_group_size = np.mean([len(group) for group in concat_indices])
     print(f"Num. of Cluster: {len(concat_indices)}, Max group size: {max_group_size}, Avg. group size: {avg_group_size:.2f}")
 
-    # Let's use the clustering result here to visualize it with the function imported below~~~
-
     # ========================== [Ready to summarize] ==================
-    batch_clusters = [  # sentences are merged per topic
+    batch_clusters = [
         " ".join([segments[gi] for gi in group]) for group in concat_indices
     ]
+
+    visualize_pth = analyze_text_clusters(segments, concat_indices)
 
     # ========================== [Summarize] ===========================
     print("Summarizing... ", end="", flush=True)
@@ -71,78 +60,197 @@ def summarize_and_visualization(text, config):
                            config.mini_batch.size)
 
         batch_summaries = []
-        batch_token_importances = []
+        batch_importances = []
         for i in range(0, len(batch_clusters), mini_batch_size):
-            mini_batch_summaries, mini_batch_token_importance = summarizer(batch_clusters[i:i+mini_batch_size], **config.summary.args)
+            mini_batch_summaries, mini_batch_importances = summarizer(batch_clusters[i:i+mini_batch_size], cal_grad=True, **config.summary.args)
             batch_summaries.append(mini_batch_summaries)
-            batch_token_importances.append(mini_batch_token_importance)
-        batch_summaries = " ".join(batch_summaries)
-        # Logic to combine or average the token_importance values may be needed.
-        token_importance = np.mean(batch_token_importances, axis=0)
+            batch_importances.append(mini_batch_importances)
+        total_summaries = " ".join(batch_summaries)
     else:
-        batch_summaries, token_importance = summarizer(batch_clusters, **config.summary.args)
+        batch_summaries = summarizer(batch_clusters, **config.summary.args)
     e = time.time()
     print("Done", f"{e-s:.2f} sec")
 
     # ========================== [Evaluate] ============================
     print("Evaluating... ", end="", flush=True)
     s = time.time()
 
-    rouge1, rouge2, rougeL = calculate_rouge_scores(text, batch_summaries)
-    b_score = calculate_bert_score(text, batch_summaries)
+    rouge1, rouge2, rougeL = calculate_rouge_scores(text, total_summaries)
+    s_score = calculate_semantic_similarity(text, total_summaries)
 
     # scale score * 100
     rouge1, rouge2, rougeL = rouge1*100, rouge2*100, rougeL*100
-    b_score = b_score * 100
+    s_score = s_score * 100
+
+    e = time.time()
+    print("Done", f"{e-s:.2f} sec")
 
-    # ========================== [Post-process] ========================
-    if b_score > max_score:  # score uses a single metric that can be compared by magnitude
-        max_score = b_score
-        best_summary = batch_summaries
-        best_index = 0
+    print(f"=> ROUGE-1: {rouge1:.2f}, ROUGE-2: {rouge2:.2f}, ROUGE-L: {rougeL:.2f}")
+    print(f"=> BERTScore: {s_score:.2f}")
 
-    evaluation_results.append({
-        'summary': batch_summaries,
+    # ========================== [Post-process] ========================
+    evaluation_results = {
         'rouge1': rouge1,
         'rouge2': rouge2,
         'rougeL': rougeL,
-        'bert_score': b_score,
-        'token_importance': token_importance.tolist(),
-        # 'visualization': visualization_path  # need to add the graph visualization path
-    })
-
-    # Return all results.
-    return evaluation_results
-
-def brushing_and_resummarize(datasets, config, selected_text):
-    """
-    Generate a summary based on the similarity between the user-selected text and the full text.
-
-    Args:
-    - datasets (list of str): the full text list.
-    - config (Box): configuration object. -> honestly not sure why this is needed
-    - selected_text (str): text selected by the user.
-
-    Returns:
-    - list of dict: summary and evaluation results for each text.
-    """
-    results = []
-    # Split the full text into sentences
-    sentences = datasets.split('. ')
+        'bert_score': s_score
+    }
+
+    return batch_summaries, batch_importances, evaluation_results, visualize_pth
+
+def resummarize_with_sentece(full_text: str, target_text: str):
+    # ========================= [Load config] ===========================
+    with open("config.yaml", "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    config = Box(config)
+
+    # ========================== [Segmentation] ========================
+    print("Segmentating... ", end="", flush=True)
+    s = time.time()
+    segments = segmentate_sentence(full_text, **config.segment.args)
+    e = time.time()
+    print("Done", f"{e-s:.2f} sec")
+
+    # ========================== [Filtering] ==========================
+    print("Filtering... ", end="", flush=True)
+
+    filtered_text = []
+    for segment in segments:
+        if calculate_semantic_similarity(segment, target_text) > 0.8:
+            filtered_text.append(segment)
+
+    filtered_text = " ".join(filtered_text)
+
+    # ========================== [Summarize] ===========================
+    print("Summarizing... ", end="", flush=True)
+    batch_summaries = summarizer(filtered_text, cal_grad=False, **config.summary.args)
+
+    return batch_summaries
+
+# # ========================= [Load config] ===========================
+# with open("config.yaml", "r") as f:
+#     config = yaml.load(f, Loader=yaml.FullLoader)
+# config = Box(config)
+
+# print('Experiment name:', config.experiment_name)
+# print('===============================================')
+
+# # ========================== [Run experiments] ==========================
+# def summarize_and_visualization(text, config):
+#     max_score = 0
+#     best_summary = ""
+#     best_index = 0
+
+#     evaluation_results = []
+
+#     init_s = time.time()
+
+#     # ========================== [Segmentation] ========================
+#     print("Segmentating... ", end="", flush=True)
+#     s = time.time()
+#     segments = segmentate_sentence(text, **config.segment.args)
+#     e = time.time()
+#     print("Done", f"{e-s:.2f} sec")
+
+#     # ========================== [Clustering] ==========================
+#     print("Clustering... ", end="", flush=True)
+#     s = time.time()
+#     concat_indices = globals()[config.concat.method](segments, **config.concat.args)
+#     e = time.time()
+#     print("Done", f"{e-s:.2f} sec")
+
+#     max_group_size = max([len(group) for group in concat_indices])
+#     avg_group_size = np.mean([len(group) for group in concat_indices])
+#     print(f"Num. of Cluster: {len(concat_indices)}, Max group size: {max_group_size}, Avg. group size: {avg_group_size:.2f}")
+
+#     # Let's use the clustering result here to visualize it with the function imported below~~~
+
+#     # ========================== [Ready to summarize] ==================
+#     batch_clusters = [  # sentences are merged per topic
+#         " ".join([segments[gi] for gi in group]) for group in concat_indices
+#     ]
+
+#     # ========================== [Summarize] ===========================
+#     print("Summarizing... ", end="", flush=True)
+#     s = time.time()
+#     if config.mini_batch.size > 0:
+#         mini_batch_size = (len(batch_clusters)
+#                            if len(batch_clusters) < config.mini_batch.size else
+#                            config.mini_batch.size)
+
+#         batch_summaries = []
+#         batch_token_importances = []
+#         for i in range(0, len(batch_clusters), mini_batch_size):
+#             mini_batch_summaries, mini_batch_token_importance = summarizer(batch_clusters[i:i+mini_batch_size], **config.summary.args)
+#             batch_summaries.append(mini_batch_summaries)
+#             batch_token_importances.append(mini_batch_token_importance)
+#         batch_summaries = " ".join(batch_summaries)
+#         # Logic to combine or average the token_importance values may be needed.
+#         token_importance = np.mean(batch_token_importances, axis=0)
+#     else:
+#         batch_summaries, token_importance = summarizer(batch_clusters, **config.summary.args)
+#     e = time.time()
+#     print("Done", f"{e-s:.2f} sec")
+
+#     # ========================== [Evaluate] ============================
+#     print("Evaluating... ", end="", flush=True)
+#     s = time.time()
+
+#     rouge1, rouge2, rougeL = calculate_rouge_scores(text, batch_summaries)
+#     b_score = calculate_bert_score(text, batch_summaries)
+
+#     # scale score * 100
+#     rouge1, rouge2, rougeL = rouge1*100, rouge2*100, rougeL*100
+#     b_score = b_score * 100
+
+#     # ========================== [Post-process] ========================
+#     if b_score > max_score:  # score uses a single metric that can be compared by magnitude
+#         max_score = b_score
+#         best_summary = batch_summaries
+#         best_index = 0
+
+#     evaluation_results.append({
+#         'summary': batch_summaries,
+#         'rouge1': rouge1,
+#         'rouge2': rouge2,
+#         'rougeL': rougeL,
+#         'bert_score': b_score,
+#         'token_importance': token_importance.tolist(),
+#         # 'visualization': visualization_path  # need to add the graph visualization path
+#     })
+
+#     # Return all results.
+#     return evaluation_results
+
+# def brushing_and_resummarize(datasets, config, selected_text):
+#     """
+#     Generate a summary based on the similarity between the user-selected text and the full text.
+
+#     Args:
+#     - datasets (list of str): the full text list.
+#     - config (Box): configuration object. -> honestly not sure why this is needed
+#     - selected_text (str): text selected by the user.
+
+#     Returns:
+#     - list of dict: summary and evaluation results for each text.
+#     """
+#     results = []
+#     # Split the full text into sentences
+#     sentences = datasets.split('. ')
 
-    # Compute similarity between each sentence and the selected text
-    similarities = [calculate_semantic_similarity(sentence, selected_text) for sentence in sentences]
+#     # Compute similarity between each sentence and the selected text
+#     similarities = [calculate_semantic_similarity(sentence, selected_text) for sentence in sentences]
 
-    # Sort by similarity and pick the top n sentences
-    n = 3  # number of sentences to include in the summary
-    top_sentences = [sentences[i] for i in np.argsort(similarities)[-n:]]
+#     # Sort by similarity and pick the top n sentences
+#     n = 3  # number of sentences to include in the summary
+#     top_sentences = [sentences[i] for i in np.argsort(similarities)[-n:]]
 
-    # Join the selected sentences to build the summary
-    summary = '. '.join(top_sentences)
+#     # Join the selected sentences to build the summary
+#     summary = '. '.join(top_sentences)
 
 
-    result = {
-        'summary': summary,
-    }
+#     result = {
+#         'summary': summary,
+#     }
 
-    return result
+#     return result
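
For reference, a minimal usage sketch of the two entry points this commit updates. It assumes mainrun.py is importable from the project root and that config.yaml is present in the working directory; the sample strings are placeholders, not real data.

from mainrun import exe_by_sentences, resummarize_with_sentece

full_text = "..."      # placeholder: the document to summarize
target_text = "..."    # placeholder: the user-selected sentence(s)

# Per this diff, exe_by_sentences returns the summaries, their importances,
# an evaluation dict (rouge1/rouge2/rougeL/bert_score), and the path of the
# cluster visualization produced by analyze_text_clusters.
summaries, importances, metrics, viz_path = exe_by_sentences(full_text)

# resummarize_with_sentece keeps only segments whose semantic similarity to
# target_text exceeds 0.8 and summarizes the filtered text.
focused_summary = resummarize_with_sentece(full_text, target_text)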
