diff --git a/price_parser/parser.py b/price_parser/parser.py
index 364ab2c..7d4e375 100644
--- a/price_parser/parser.py
+++ b/price_parser/parser.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import itertools
 import re
 import string
 from typing import Callable, Optional, Pattern, List, Tuple
@@ -56,8 +57,28 @@ def fromstring(cls, price: Optional[str],
 
 
 def or_regex(symbols: List[str]) -> Pattern:
-    """ Return a regex which matches any of ``symbols`` """
-    return re.compile('|'.join(re.escape(s) for s in symbols))
+    """
+    Return a regex which matches any of ``symbols`` when the symbol is
+    delimited on both sides.
+
+    The symbol must be preceded by the start of the string, whitespace or
+    a digit, and followed by the end of the string, whitespace, a digit or
+    any other non-alphanumeric character. Only the symbol is captured (each
+    alternative has its own group); the overall match also contains the
+    delimiting characters.
+    """
+    left_tokens = [r"^", r"\s+?", r"\d+?"]
+    right_tokens = [r"$", r"\s+?", r"\d+?", r"[^a-zA-Z0-9]+?"]
+
+    return re.compile(
+        "|".join(
+            [
+                left + "({})".format(re.escape(s)) + right
+                for left, right in itertools.product(left_tokens, right_tokens)
+                for s in symbols
+            ]
+        )
+    )
 
 
 # If one of these symbols is found either in price or in currency,
@@ -96,16 +117,22 @@ def or_regex(symbols: List[str]) -> Pattern:
 DOLLAR_CODES = [k for k in CURRENCY_CODES if k.endswith('D')]
 _DOLLAR_REGEX = re.compile(
     r'''
-        \b
+        (\b
         (?:{})  # currency code like NZD
         (?=
             \$?  # dollar sign to ignore if attached to the currency code
             (?:[\W\d]|$)  # not a letter
-        )
+        ))
     '''.format('|'.join(re.escape(k) for k in DOLLAR_CODES)),
     re.VERBOSE,
 )
 
+OTHER_PARTICULAR_REGEXES = [
+    # HT is the French abbreviation for "hors taxe" (tax not added to the
+    # price) and it may appear right after the € currency symbol; only the
+    # symbol itself is captured
+    r"(€)HT",
+]
 
 # Other common currency symbols: 3-letter codes, less safe abbreviations
 OTHER_CURRENCY_SYMBOLS_SET = (
@@ -115,7 +142,7 @@ def or_regex(symbols: List[str]) -> Pattern:
         CURRENCY_NATIONAL_SYMBOLS +
 
         # even if they appear in text, currency is likely to be rouble
-        ['р', 'Р']
+        ['р', 'Р'],
     )
     - set(SAFE_CURRENCY_SYMBOLS)   # already handled
     - {'-', 'XXX'}                 # placeholder values
@@ -125,6 +152,9 @@ def or_regex(symbols: List[str]) -> Pattern:
                                 key=len, reverse=True)
 
 _search_dollar_code = _DOLLAR_REGEX.search
+_search_other_particular_regexes = re.compile(
+    "|".join(OTHER_PARTICULAR_REGEXES)
+).search
 _search_safe_currency = or_regex(SAFE_CURRENCY_SYMBOLS).search
 _search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search
 
@@ -140,6 +170,7 @@ def extract_currency_symbol(price: Optional[str],
         (_search_safe_currency, currency_hint),
         (_search_unsafe_currency, price),
         (_search_unsafe_currency, currency_hint),
+        (_search_other_particular_regexes, price),
     ]
 
     if currency_hint and '$' in currency_hint:
@@ -151,7 +182,12 @@ def extract_currency_symbol(price: Optional[str],
     for meth, attr in methods:
         m = meth(attr) if attr else None
         if m:
-            return m.group(0)
+            # every search pattern wraps the currency symbol in a capturing
+            # group; the surrounding delimiters are part of the match but
+            # must not be returned
+            groups = [group for group in m.groups() if group is not None]
+            # fall back to the whole match for a pattern without groups
+            return groups[-1] if groups else m.group(0)
 
     return None
 
diff --git a/tests/test_price_parsing.py b/tests/test_price_parsing.py
index 28ee0a3..ad690ae 100644
--- a/tests/test_price_parsing.py
+++ b/tests/test_price_parsing.py
@@ -86,7 +86,21 @@ def __eq__(self, other):
     Example(None, '12,000원',
             '원', '12,000', 12000),
     Example(None, '3,500円',
-            '円', '3,500', 3500)
+            '円', '3,500', 3500),
+    Example(None, 'EUROPE',
+            None, None, None),
+    Example(None, 'NEUROLOGY PRICES',
+            None, None, None),
+    Example(None, 'Prices in EUR',
+            'EUR', None, None),
+    Example(None, 'Prices in EUR for all products',
+            'EUR', None, None),
+    Example(None, 'EUR is the selected currency',
+            'EUR', None, None),
+    Example(None, ' Prices in EUR ',
+            'EUR', None, None),
+    Example(None, '13800 ₶ ',
+            '₶', '13800', 13800)
 ]
 
 
@@ -1919,6 +1933,10 @@ def __eq__(self, other):
             'CHF', '19.90', 19.90),
     Example('', '530,42 Zł',
             'Zł', '530,42', 530.42),
+    Example('>', 'См. цену в прайсе',
+            None, None, None),
+    Example('Купить', 'Печная труба',
+            None, None, None),
 ]
 
 
@@ -1941,12 +1959,6 @@ def __eq__(self, other):
     Example('Cuneo', '61.858 L',  # Romanian New Leu
             'L', '61.858', 61858),
 
-    # "р" / "руб" is detected as currency
-    Example('>', 'См. цену в прайсе',
-            None, None, None),
-    Example('Купить', 'Печная труба',
-            None, None, None),
-
     # dates
     Example(None, 'July, 2004',
             None, None, None),