From 3441b1226c2fbe1237297f4d8a7fa01bac2520a8 Mon Sep 17 00:00:00 2001 From: pekopoke <1135796875@qq.com> Date: Thu, 18 Sep 2025 09:45:23 +0800 Subject: [PATCH 1/2] update dataset paths for local evaluation --- examples/multi_extractor_compare.py | 2 +- scripts/statics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/multi_extractor_compare.py b/examples/multi_extractor_compare.py index 301bc2d..0db0642 100644 --- a/examples/multi_extractor_compare.py +++ b/examples/multi_extractor_compare.py @@ -8,7 +8,7 @@ def all_extractor_comparison(): print("\n=== 多抽取器对比演示 ===\n") # 创建数据集 - dataset_path = Path("/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl") + dataset_path = Path("/home/lulindong/Pycharm_projects/cc/7887/1-200+245_formula.jsonl") dataset = DataLoader.load_jsonl(dataset_path) # 创建webkit抽取器 diff --git a/scripts/statics.py b/scripts/statics.py index 56b8b7b..94d3042 100644 --- a/scripts/statics.py +++ b/scripts/statics.py @@ -1067,9 +1067,9 @@ def main(): # data/sample_dataset.jsonl # data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl parser.add_argument("input_file", nargs='?', - default="data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl", + default="/home/lulindong/Pycharm_projects/cc/WebMainBench_test_WebMainBench_7887_language_output_with_stats.jsonl", help="输入JSONL文件路径") - parser.add_argument("--output", default="data/sample_dataset_with_stats.jsonl", type=str, help="输出文件路径") + parser.add_argument("--output", default="/home/lulindong/Pycharm_projects/cc/7887_meta.jsonl", type=str, help="输出文件路径") args = parser.parse_args() From f6b84ebd21a19681ac1b729c174af10c0f1f7e27 Mon Sep 17 00:00:00 2001 From: pekopoke <1135796875@qq.com> Date: Mon, 13 Oct 2025 11:51:37 +0800 Subject: [PATCH 2/2] update trafilatura extract txt --- examples/multi_extractor_compare.py | 6 +- requirements.txt | 3 +- webmainbench/data/saver.py | 1 + .../extractors/trafilatura_extractor.py | 2 +-
.../extractors/trafilatura_txt_extractor.py | 132 ++++++++++++++++++ webmainbench/metrics/base.py | 2 +- 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 webmainbench/extractors/trafilatura_txt_extractor.py diff --git a/examples/multi_extractor_compare.py b/examples/multi_extractor_compare.py index b2f3258..98d27a1 100644 --- a/examples/multi_extractor_compare.py +++ b/examples/multi_extractor_compare.py @@ -19,14 +19,16 @@ def all_extractor_comparison(): webkit_extractor = ExtractorFactory.create("llm-webkit", config=config) # 创建magic-extractor抽取器 magic_extractor = ExtractorFactory.create("magic-html") - # 创建trafilatura抽取器 + # 创建trafilatura抽取器,抽取成markdown trafilatura_extractor = ExtractorFactory.create("trafilatura") + # 创建trafilatura抽取器,抽取成txt + trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt") # 创建resiliparse抽取器 resiliparse_extractor = ExtractorFactory.create("resiliparse") # 运行对比 evaluator = Evaluator() - extractors = [webkit_extractor, magic_extractor, trafilatura_extractor, resiliparse_extractor] + extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor] # extractors = [webkit_extractor] diff --git a/requirements.txt b/requirements.txt index d9b3aeb..feaf240 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-rel streamlit markdown jieba -apted \ No newline at end of file +apted +openai \ No newline at end of file diff --git a/webmainbench/data/saver.py b/webmainbench/data/saver.py index 247dc4f..75640ed 100644 --- a/webmainbench/data/saver.py +++ b/webmainbench/data/saver.py @@ -153,6 +153,7 @@ def to_dict_if_needed(item): 'llm-webkit': 'llm_web_kit', 'magic-html': 'magic_html', 'trafilatura': 'trafilatura', + 'trafilatura_txt': 'trafilatura', 'resiliparse': 'resiliparse' } package_name = package_mapping.get(extractor_name, extractor_name) diff --git 
a/webmainbench/extractors/trafilatura_extractor.py b/webmainbench/extractors/trafilatura_extractor.py index fd2f48a..205c318 100644 --- a/webmainbench/extractors/trafilatura_extractor.py +++ b/webmainbench/extractors/trafilatura_extractor.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from .base import BaseExtractor, ExtractionResult from .factory import extractor -from trafilatura import extract,html2txt +from trafilatura import extract import re diff --git a/webmainbench/extractors/trafilatura_txt_extractor.py b/webmainbench/extractors/trafilatura_txt_extractor.py new file mode 100644 index 0000000..abdce9b --- /dev/null +++ b/webmainbench/extractors/trafilatura_txt_extractor.py @@ -0,0 +1,132 @@ + +""" +trafilatura extractor implementation. +""" +from typing import Dict, Any, Optional, List +from dataclasses import dataclass +from .base import BaseExtractor, ExtractionResult +from .factory import extractor +from trafilatura import extract,html2txt,baseline +import re + + +@dataclass +class TrafilaturaInferenceConfig: + """Configuration for Trafilatura extractor.""" + favor_precision: bool = True # 优先精度:只提取最核心的内容,过滤更多冗余(如侧边栏、广告),默认开启 + favor_recall: bool = True # 优先召回:尽可能提取所有潜在有效内容,减少遗漏,默认开启 + include_comments: bool = False # 是否保留评论,默认关闭 + include_tables: bool = True # 是否保留提取html表格,默认开启 + include_images: bool = False # 是否保留提取图片信息,默认关闭 + include_links: bool = False # 是否保留链接,默认关闭 + with_metadata: bool = False # 是否保留元信息,默认关闭 + skip_elements: bool = False # 是否保留CSS隐藏元素,默认关闭 + output_format: str = "markdown" # 支持多种格式输出:"csv", "json", "html", "markdown", "txt", "xml"等 + + +@extractor("trafilatura_txt") +class TrafilaturaExtractor(BaseExtractor): + """Extractor using Trafilatura.""" + + version = "2.0.0" + description = "Trafilatura based content extractor" + + def __init__(self, name: str, config: Optional[Dict[str, Any]] = None): + super().__init__(name, config) + self.inference_config = TrafilaturaInferenceConfig() + + # 应用用户配置 + if config: + for key, value in
config.items(): + if hasattr(self.inference_config, key): + setattr(self.inference_config, key, value) + + def _setup(self) -> None: + """Set up the Trafilatura extractor.""" + # 初始化操作 + pass + + def _extract_content(self, html: str, url: str = None) -> ExtractionResult: + """ + Extract content using Trafilatura. + + Args: + html: HTML content to extract from + url: Optional URL of the page + + Returns: + ExtractionResult instance + """ + try: + # 使用配置参数进行内容抽取 + # content = extract( + # html, + # url=url, + # favor_precision=self.inference_config.favor_precision, + # favor_recall=self.inference_config.favor_recall, + # include_comments=self.inference_config.include_comments, + # include_tables=self.inference_config.include_tables, + # include_images=self.inference_config.include_images, + # include_links=self.inference_config.include_links, + # with_metadata=self.inference_config.with_metadata, + # output_format=self.inference_config.output_format # 传入输出格式 + # + # ) + + # 使用最大化召回率进行内容抽取成txt + # content = html2txt(html) + + # 使用输出更准确的抽取txt结果 + postbody, content, len_text = baseline(html) + + # 创建 content_list(简单分割段落) + content_list = [] + if content: + paragraphs = content.split('\n\n') + for i, para in enumerate(paragraphs): + if para.strip(): + content_list.append({ + "type": "paragraph", + "content": para.strip(), + "index": i + }) + + return ExtractionResult( + content=content, + # content_list=content_list, + title=self._extract_title(html), + language=self._detect_language(content), + success=True + ) + + except Exception as e: + return ExtractionResult.create_error_result( + f"Trafilatura extraction failed: {str(e)}" + ) + + def _extract_title(self, html: str) -> Optional[str]: + """提取页面标题.""" + try: + import re + title_match = re.search(r'