From 0df7944858a65180f9393f64ed2f8f896c43cf3a Mon Sep 17 00:00:00 2001 From: rpalsaxena Date: Mon, 2 Mar 2020 19:16:03 +0530 Subject: [PATCH 1/3] Update: Handle million/billion in the price --- price_parser/parser.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index 364ab2c..96ece66 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -3,12 +3,10 @@ import string from typing import Callable, Optional, Pattern, List, Tuple from decimal import Decimal, InvalidOperation - import attr from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS, CURRENCY_SYMBOLS) - @attr.s(auto_attribs=True) class Price: amount: Optional[Decimal] # price numeric value, as Decimal @@ -51,10 +49,8 @@ def fromstring(cls, price: Optional[str], amount_text=amount_text, ) - parse_price = Price.fromstring - def or_regex(symbols: List[str]) -> Pattern: """ Return a regex which matches any of ``symbols`` """ return re.compile('|'.join(re.escape(s) for s in symbols)) @@ -201,13 +197,21 @@ def extract_price_text(price: str) -> Optional[str]: m = re.search(r""" (\d[\d\s.,]*) # number, probably with thousand separators \s*? # skip whitespace + ([m|M,b|B]il\w*)? # check million* or billion* (?:[^%\d]|$) # capture next symbol - it shouldn't be % - """, price, re.VERBOSE) + """, price, re.VERBOSE) if m: - return m.group(1).strip(',.').strip() + price = m.group(0).strip(',.').strip().lower() + if 'bil' in price: + price = price.split(' ')[0] + ' x 10\u2079' + elif 'mil' in price: + price = price.split(' ')[0] + ' x 10\u2076' + + return price if 'free' in price.lower(): return '0' + return None @@ -240,7 +244,7 @@ def get_decimal_separator(price: str) -> Optional[str]: """ m = _search_decimal_sep(price) if m: - return m.group(1) + return m.group(0) def parse_number(num: str, @@ -294,6 +298,7 @@ def parse_number(num: str, assert decimal_separator == '€' num = num.replace('.', '').replace(',', '').replace('€', '.') try: - return Decimal(num) + #return Decimal(num) + return num except InvalidOperation: - return None + return None \ No newline at end of file From 4edd2249011fe6f7f2d44c0cc0e97800c5724b6f Mon Sep 17 00:00:00 2001 From: rpalsaxena Date: Mon, 2 Mar 2020 19:16:38 +0530 Subject: [PATCH 2/3] Update: Handle million/billion in the price --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7c4140e..6382ad3 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ name='price-parser', version='0.3.1', description='Extract price and currency from a raw string', - long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), + long_description=open('README.rst', encoding="utf8").read() + "\n\n" + open('CHANGES.rst').read(), author='Mikhail Korobov', author_email='kmike84@gmail.com', url='https://github.com/scrapinghub/price-parser', From d882cccbb89e9b4b9f80d103a3ec4de5c19930c9 Mon Sep 17 00:00:00 2001 From: rpalsaxena Date: Thu, 5 Mar 2020 17:09:41 +0530 Subject: [PATCH 3/3] Update: Resolve issue[1] --- price_parser/parser.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/price_parser/parser.py b/price_parser/parser.py index 96ece66..e03dc95 100644 --- a/price_parser/parser.py +++ b/price_parser/parser.py @@ -3,10 +3,12 @@ import string from typing import Callable, Optional, Pattern, List, Tuple from decimal import Decimal, InvalidOperation + import attr from ._currencies import (CURRENCY_CODES, CURRENCY_NATIONAL_SYMBOLS, CURRENCY_SYMBOLS) + @attr.s(auto_attribs=True) class Price: amount: Optional[Decimal] # price numeric value, as Decimal @@ -29,7 +31,6 @@ def fromstring(cls, price: Optional[str], Given price and currency text extracted from HTML elements, return ``Price`` instance, which provides a clean currency symbol and price amount as a Decimal number. - ``currency_hint`` is optional; you can pass value of some element which may contain currency, as a hint. If currency is present in ``price`` string, it could be **preferred** over a value extracted @@ -49,8 +50,10 @@ def fromstring(cls, price: Optional[str], amount_text=amount_text, ) + parse_price = Price.fromstring + def or_regex(symbols: List[str]) -> Pattern: """ Return a regex which matches any of ``symbols`` """ return re.compile('|'.join(re.escape(s) for s in symbols)) @@ -158,7 +161,6 @@ def extract_price_text(price: str) -> Optional[str]: maybe some other text. If multiple price-looking substrings are present, the first is returned (FIXME: it is better to return a number which is near a currency symbol). - >>> extract_price_text("price: $12.99") '12.99' >>> extract_price_text("Free") @@ -166,10 +168,8 @@ def extract_price_text(price: str) -> Optional[str]: >>> extract_price_text("Foo") >>> extract_price_text("1,235 USD") '1,235' - In addition to numbers, it has a limited support for a case where currency symbol (currently only euro) is a decimal separator: - >>> extract_price_text("99 €, 79 €") '99' >>> extract_price_text("99 € 79 €") @@ -199,19 +199,18 @@ def extract_price_text(price: str) -> Optional[str]: \s*? # skip whitespace ([m|M,b|B]il\w*)? # check million* or billion* (?:[^%\d]|$) # capture next symbol - it shouldn't be % - """, price, re.VERBOSE) + """, price, re.VERBOSE) if m: - price = m.group(0).strip(',.').strip().lower() - if 'bil' in price: - price = price.split(' ')[0] + ' x 10\u2079' - elif 'mil' in price: - price = price.split(' ')[0] + ' x 10\u2076' - + subprice = m.group(2) + if 'bil' in subprice: + price = m.group(1).strip(',.').strip()+' x 10\u2079' + elif 'mil' in subprice: + price = m.group(1).strip(',.').strip()+' x 10\u2076' return price + if 'free' in price.lower(): return '0' - return None @@ -230,7 +229,6 @@ def extract_price_text(price: str) -> Optional[str]: def get_decimal_separator(price: str) -> Optional[str]: """ Return decimal separator symbol or None if there is no decimal separator. - >>> get_decimal_separator("1000") >>> get_decimal_separator("12.99") '.' @@ -244,14 +242,13 @@ def get_decimal_separator(price: str) -> Optional[str]: """ m = _search_decimal_sep(price) if m: - return m.group(0) + return m.group(1) def parse_number(num: str, decimal_separator: Optional[str] = None) -> Optional[Decimal]: """ Parse a string with a number to a Decimal, guessing its format: decimal separator, thousand separator. Return None if parsing fails. - >>> parse_number("1,234") Decimal('1234') >>> parse_number("12,34") @@ -285,7 +282,12 @@ def parse_number(num: str, """ if not num: return None - num = num.strip().replace(' ', '') + num_copy = num + if 'x' in num: + num = num.split()[0] + else: + num = num.strip().replace(' ', '') + decimal_separator = decimal_separator or get_decimal_separator(num) # NOTE: Keep supported separators in sync with _search_decimal_sep if decimal_separator is None: @@ -298,7 +300,7 @@ def parse_number(num: str, assert decimal_separator == '€' num = num.replace('.', '').replace(',', '').replace('€', '.') try: - #return Decimal(num) + num = num+num_copy.split()[1]+num_copy.split()[2] if 'x' in num_copy else '' return num except InvalidOperation: return None \ No newline at end of file