diff --git a/price_parser/parser.py b/price_parser/parser.py index 364ab2c..e03dc95 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -31,7 +31,6 @@ def fromstring(cls, price: Optional[str], Given price and currency text extracted from HTML elements, return ``Price`` instance, which provides a clean currency symbol and price amount as a Decimal number. - ``currency_hint`` is optional; you can pass value of some element which may contain currency, as a hint. If currency is present in ``price`` string, it could be **preferred** over a value extracted @@ -162,7 +161,6 @@ def extract_price_text(price: str) -> Optional[str]: maybe some other text. If multiple price-looking substrings are present, the first is returned (FIXME: it is better to return a number which is near a currency symbol). - >>> extract_price_text("price: $12.99") '12.99' >>> extract_price_text("Free") @@ -170,10 +168,8 @@ def extract_price_text(price: str) -> Optional[str]: >>> extract_price_text("Foo") >>> extract_price_text("1,235 USD") '1,235' - In addition to numbers, it has a limited support for a case where currency symbol (currently only euro) is a decimal separator: - >>> extract_price_text("99 €, 79 €") '99' >>> extract_price_text("99 € 79 €") @@ -201,11 +197,18 @@ def extract_price_text(price: str) -> Optional[str]: m = re.search(r""" (\d[\d\s.,]*) # number, probably with thousand separators \s*? # skip whitespace + ([m|M,b|B]il\w*)? # check million* or billion* (?:[^%\d]|$) # capture next symbol - it shouldn't be % """, price, re.VERBOSE) if m: - return m.group(1).strip(',.').strip() + subprice = m.group(2) + if 'bil' in subprice: + price = m.group(1).strip(',.').strip()+' x 10\u2079' + elif 'mil' in subprice: + price = m.group(1).strip(',.').strip()+' x 10\u2076' + return price + if 'free' in price.lower(): return '0' return None @@ -226,7 +229,6 @@ def extract_price_text(price: str) -> Optional[str]: def get_decimal_separator(price: str) -> Optional[str]: """ Return decimal separator symbol or None if there is no decimal separator. - >>> get_decimal_separator("1000") >>> get_decimal_separator("12.99") '.' @@ -247,7 +249,6 @@ def parse_number(num: str, decimal_separator: Optional[str] = None) -> Optional[Decimal]: """ Parse a string with a number to a Decimal, guessing its format: decimal separator, thousand separator. Return None if parsing fails. - >>> parse_number("1,234") Decimal('1234') >>> parse_number("12,34") @@ -281,7 +282,12 @@ def parse_number(num: str, """ if not num: return None - num = num.strip().replace(' ', '') + num_copy = num + if 'x' in num: + num = num.split()[0] + else: + num = num.strip().replace(' ', '') + decimal_separator = decimal_separator or get_decimal_separator(num) # NOTE: Keep supported separators in sync with _search_decimal_sep if decimal_separator is None: @@ -294,6 +300,7 @@ def parse_number(num: str, assert decimal_separator == '€' num = num.replace('.', '').replace(',', '').replace('€', '.') try: - return Decimal(num) + num = num+num_copy.split()[1]+num_copy.split()[2] if 'x' in num_copy else '' + return num except InvalidOperation: - return None + return None \ No newline at end of file diff --git a/setup.py b/setup.py index 7c4140e..6382ad3 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ name='price-parser', version='0.3.1', description='Extract price and currency from a raw string', - long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), + long_description=open('README.rst', encoding="utf8").read() + "\n\n" + open('CHANGES.rst').read(), author='Mikhail Korobov', author_email='kmike84@gmail.com', url='https://github.com/scrapinghub/price-parser',