Skip to content

Commit 15e31d5

Browse files
authored
Merge pull request #44 from pekopoke/dev
Add extractor version to results; fix formula matching for dollar signs inside tables
2 parents 4cf8b09 + 27f2639 commit 15e31d5

File tree

4 files changed: +41 additions, -13 deletions

tests/test_formula_extraction.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,5 +167,19 @@ def test_formula_within_table(self):
167167
self.assertIn('| 公式类型 | 示例 |', result['table'])
168168

169169

170+
def test_dollar_within_table(self):
171+
"""测试表格中的转义$包裹的内容不要被提取"""
172+
173+
text = """
174+
<table><tbody><tr><td><table><tbody><tr><td><table><tbody><tr><td><strong>Better Management of /$800 Bln Forex Reserves Urged</strong></td></tr></tbody></table></td></tr><tr><td><p>A number of political advisors on Sunday called for more rationally managing China's massive foreign exchange reserves, which doubled over the 2004-05 period to an equivalent of US /$818.9 billion, second only to Japan.</p><p>The quick buildup is largely a result of China's booming exports and foreign exchange controls by the government, as well as speculation on the yuan's rise, industry watchers agree.</p><p>A big part of China's foreign exchange reserves are US dollar-denominated assets, including bonds issued by the US government. "Risks in the international foreign exchange market should be lowered when China manages its reserves," said Professor Guo Guoqing of a business school of the People's University of China.</p><p>Guo, a member of the National Committee of the Chinese People's Political Consultative Conference (CPPCC), the country's top advisory body, urged the government to cut back on subsidies for exports and take other measures to reduce foreign trade surpluses appropriately and achieve the balance in international payments.</p><p>Part of the reserves should be channeled into the imports of more high-tech machinery, equipment and other products, he suggested on the sidelines of the CPPCC's annual session.</p><p>The United States has been contending that the value of yuan, also known as renminbi or RMB, is too low, giving Chinese exporters an "unfair" advantage. 
But China said its huge trade surpluses are also a result of the US reluctance to export goods involving state-of-the-art technologies.</p><p>Fu Rui, also a CPPCC member, said with ample foreign exchange reserves, China could intentionally bulk up the reserves of strategic resources.</p><p>The international consensus is a country's rational foreign exchange reserves should equal to its imports demand for a full quarter. Also taking into consideration of payments for foreign debts, returns for foreign investors and other demands in China, many believe it is enough for the country to retain US/$300 billion.</p><p>But Lin Yifu, a popular economist, underscored China's per capita foreign exchange reserves remains not large - less than one-tenth of Japan's and far below that of Hong Kong and Singapore.</p><p>The reserves were "tremendous fruits" from China's reform and opening-up drive, he said.</p><p>His remarks were echoed by Xiao Zhuoji, a well-known economics professor with Beijing University. 
"The rise of foreign exchange reserves reflects China's fast, sustained economic growth and sound international payments," he said.</p><p>"The reserves are of significant importance to upgrade the China image in the international economic arena, strengthen the nation's macro-control capabilities and guard against financial risks," added Xiao, a Standing Committee member of the CPPCC National Committee.</p><p>But as the People's Bank of China, or the central bank, has to buy foreign exchange reserves under the current foreign exchange control policies, the country's monetary base will be enlarged, increasing its inflationary pressure and difficulties on macro-economic controls, analysts acknowledge.</p><p>Another prevailing view is that China's hefty foreign exchange reserves actually "occupied" large amounts of fund resources that otherwise can be diverted for domestic investment and consumption.</p><p>Some CPPCC members said they believe it is already "meaningless" now to talk about whether China's foreign exchange reserves size is big or not. "The key lies on how to raise the reserves' yields."</p><p>"If the annual yields from foreign exchange reserves could reach a stable 5 percent, the nation will reap in 300 billion yuan a year. What a big fortune!" one advisor told Xinhua.</p><p>Central banker Zhou Xiaochuan reiterated earlier that China will "pay attention to and maintain the flexibility" of foreign reserves structure, which is unknown to the public.</p></td></tr></tbody></table></td></tr></tbody></table>
175+
"""
176+
177+
result = self.metric._extract_from_markdown(text)
178+
179+
# 验证表格中的转义$包裹的内容不要被提取
180+
self.assertNotIn('800', result['formula'])
181+
182+
183+
170184
if __name__ == '__main__':
171185
unittest.main()

webmainbench/data/saver.py

Lines changed: 24 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -126,32 +126,45 @@ def save_summary_report(results: Union["EvaluationResult", List["EvaluationResul
126126
file_path: Output CSV file path
127127
"""
128128
import csv
129-
129+
from importlib import metadata as importlib_metadata
130+
130131
file_path = Path(file_path)
131132
file_path.parent.mkdir(parents=True, exist_ok=True)
132-
133-
# Convert EvaluationResult objects to dicts and ensure we have a list
133+
134+
# 转换结果为字典列表
134135
def to_dict_if_needed(item):
135136
return item.to_dict() if hasattr(item, 'to_dict') else item
136-
137+
137138
if isinstance(results, list):
138139
results_list = [to_dict_if_needed(item) for item in results]
139140
else:
140141
results_list = [to_dict_if_needed(results)]
141-
142-
# Prepare CSV data
142+
143143
csv_data = []
144-
145144
for result in results_list:
146-
# Extract basic info
147145
metadata = result.get('metadata', {})
148146
error_analysis = result.get('error_analysis', {})
149-
147+
148+
# 获取抽取器版本
149+
extractor_name = metadata.get('extractor_name', 'unknown')
150+
try:
151+
# 映射抽取器名称到包名
152+
package_mapping = {
153+
'llm-webkit': 'llm_web_kit',
154+
'magic-html': 'magic_html',
155+
'trafilatura': 'trafilatura',
156+
'resiliparse': 'resiliparse'
157+
}
158+
package_name = package_mapping.get(extractor_name, extractor_name)
159+
extractor_version = importlib_metadata.version(package_name)
160+
except importlib_metadata.PackageNotFoundError:
161+
extractor_version = 'unknown'
150162
row = {
151163
'extractor': metadata.get('extractor_name', 'unknown'),
152164
'dataset': metadata.get('dataset_name', 'unknown'),
153165
'total_samples': metadata.get('total_samples', 0),
154-
'success_rate': error_analysis.get('success_rate', 0.0)
166+
'success_rate': error_analysis.get('success_rate', 0.0),
167+
'extractor_version': extractor_version,
155168
}
156169

157170
# Add all available metrics from overall_metrics
@@ -170,7 +183,7 @@ def get_sort_key(row):
170183
# Write CSV file
171184
if csv_data:
172185
# Define field order: basic info first, then overall, then other metrics alphabetically
173-
basic_fields = ['extractor', 'dataset', 'total_samples', 'success_rate']
186+
basic_fields = ['extractor','extractor_version', 'dataset', 'total_samples', 'success_rate']
174187

175188
# Get all metric fields from the data
176189
all_fields = set()

webmainbench/extractors/llm_webkit_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -155,7 +155,7 @@ def process_logit(self, prompt_token_ids: List[int], input_ids: List[int], logit
155155
class LlmWebkitExtractor(BaseExtractor):
156156
"""Advanced LLM-WebKit extractor with intelligent content classification."""
157157

158-
version = "2.0.0"
158+
version = "4.0.1"
159159
description = "Advanced LLM-WebKit extractor with intelligent content classification"
160160

161161
# 分类提示模板

webmainbench/metrics/base.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -307,7 +307,8 @@ def _extract_from_markdown(text: str) -> Dict[str, str]:
307307
# r'(?<!\\)\$([^$\n\w][^$\n]*[^$\n\w])\$(?![\\\$])',
308308
r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', # 行间 $$...$$,确保 $ 没有被转义
309309
r'(?<!\\)\\\[(.*?)(?<!\\)\\\]', # 行间 \[...\],确保 \ 没有被转义
310-
r'(?<!\\)\$(.*?)(?<!\\)\$', # 行内 $...$,确保 $ 没有被转义
310+
# r'(?<!\\)\$(.*?)(?<!\\)\$', # 行内 $...$,确保 $ 没有被转义
311+
r'(?<!\\)\$(.*?)(?<!\\)\$(?!\d)', # 第二个$后面不能是数字
311312
r'(?<!\\)\\\((.*?)(?<!\\)\\\)', # 行内 \(...\),确保 \ 没有被转义
312313
]
313314

0 commit comments

Comments
 (0)