diff --git a/README.rst b/README.rst index 036da38..0495012 100644 --- a/README.rst +++ b/README.rst @@ -58,6 +58,9 @@ normalized_address_record() uses the included processing functions to remove una You may supply additional additional processing functions as a list of callable supplied to the addtl_funcs parameter. Any additional functions should take a string address and return a tuple of strings (line1, line2). +Postal codes are normalized to US zip or zip+4 and zero-padded as applicable, e.g. `2129 => 02129`, `02129-44 => 02129-0044`, `021290044 => 02129-0044`. +However, postal codes that cannot be effectively normalized, such as those with an invalid length or invalid characters, will raise AddressValidationError, e.g. `12345678901`, `02129-`, or `02129-0044-123`. + Alternately, you may extend the `NormalizeAddress` class to customize the normalization behavior by overriding any of the class' methods. If your address is in the form of a dict that does not use the keys address_line_1, address_line_2, city, state, and postal_code, you must supply a key map to the addr_map parameter in the format {standard_key: custom_key} diff --git a/scourgify/normalize.py b/scourgify/normalize.py index 33e44d4..55470f4 100644 --- a/scourgify/normalize.py +++ b/scourgify/normalize.py @@ -182,7 +182,7 @@ def normalize_addr_str(addr_str, # type: str :type zipcode: str :param addtl_funcs: optional sequence of funcs that take string for further processing and return line1 and line2 strings - :type addtl_funcs: Sequence[Callable[str, (str, str)]] + :type addtl_funcs: Sequence[Callable[str, (str)]] :return: address dict with uppercase parsed and normalized address values. :rtype: Mapping[str, str] """ @@ -241,7 +241,7 @@ def normalize_addr_str(addr_str, # type: str # line1 is set to addr_str so complete dict can be passed to error. 
line1 = addr_str - addr_rec = dict( + addr_rec = OrderedDict( address_line_1=line1, address_line_2=line2, city=city, state=state, postal_code=zipcode ) @@ -417,7 +417,11 @@ def get_parsed_values(parsed_addr, orig_val, val_label, orig_addr_str): val_from_parse = post_clean_addr_str(val_from_parse) non_null_val_set = {orig_val, val_from_parse} - {None} if len(non_null_val_set) > 1: - raise AmbiguousAddressError(None, None, orig_addr_str) + msg = ( + f'Parsed {val_label} does not align with submitted value: ' + f'Parsed: {val_from_parse}. Original: {orig_val}' + ) + raise AmbiguousAddressError(None, msg, orig_addr_str) else: return non_null_val_set.pop() if non_null_val_set else None @@ -698,28 +702,32 @@ class NormalizeAddress(object): def __init__(self, address, addr_map=None, addtl_funcs=None, strict=None): self.address = address - self.addr_map = addr_map self.addtl_funcs = addtl_funcs self.strict = True if strict is None else strict + if addr_map and not isinstance(self.address, str): + self.address = { + key: self.address.get(val) for key, val in addr_map.items() + } + + @staticmethod + def get_normalized_line_1(parsed_addr, line_labels=LINE1_USADDRESS_LABELS): + return get_normalized_line_segment(parsed_addr, line_labels) + + @staticmethod + def get_normalized_line_2(parsed_addr, line_labels=LINE2_USADDRESS_LABELS): + return get_normalized_line_segment(parsed_addr, line_labels) def normalize(self): if isinstance(self.address, str): - return self.normalize_addr_str( - self.address, addtl_funcs=self.addtl_funcs - ) + return self.normalize_addr_str(self.address) else: - return self.normalize_addr_dict( - self.address, addr_map=self.addr_map, - addtl_funcs=self.addtl_funcs, strict=self.strict - ) + return self.normalize_addr_dict() def normalize_addr_str(self, addr_str, # type: str line2=None, # type: Optional[str] city=None, # type: Optional[str] state=None, # type: Optional[str] zipcode=None, # type: Optional[str] - addtl_funcs=None - # type: 
Sequence[Callable[[str,str], str]] # noqa ): # noqa # get address parsed into usaddress components. error = None @@ -774,20 +782,16 @@ def normalize_addr_str(self, addr_str, # type: str # assumes if line2 is passed in that it need not be parsed from # addr_str. Primarily used to allow advanced processing of # otherwise unparsable addresses. - line2 = line2 if line2 else get_normalized_line_segment( - parsed_addr, LINE2_USADDRESS_LABELS - ) + line2 = line2 if line2 else self.get_normalized_line_2(parsed_addr) line2 = self.post_clean_addr_str(line2) # line 1 is fully post cleaned in get_normalized_line_segment. - line1 = get_normalized_line_segment( - parsed_addr, LINE1_USADDRESS_LABELS - ) + line1 = self.get_normalized_line_1(parsed_addr) validate_parens_groups_parsed(line1) else: # line1 is set to addr_str so complete dict can be passed to error. line1 = addr_str - addr_rec = dict( + addr_rec = OrderedDict( address_line_1=line1, address_line_2=line2, city=city, state=state, postal_code=zipcode ) @@ -796,12 +800,10 @@ def normalize_addr_str(self, addr_str, # type: str else: return addr_rec - def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None, - strict=True): - if addr_map: - addr_dict = {key: addr_dict.get(val) for key, val in - addr_map.items()} - addr_dict = validate_address_components(addr_dict, strict=strict) + def normalize_addr_dict(self): + addr_dict = validate_address_components( + self.address, strict=self.strict + ) # line 1 and line 2 elements are combined to ensure consistent # processing whether the line 2 elements are pre-parsed or @@ -816,7 +818,7 @@ def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None, try: address = self.normalize_addr_str( addr_str, city=city, state=state, - zipcode=zipcode, addtl_funcs=addtl_funcs + zipcode=zipcode ) except AddressNormalizationError: addr_str = get_addr_line_str( @@ -824,6 +826,6 @@ def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None, ) address = 
self.normalize_addr_str( addr_str, city=city, state=state, - zipcode=zipcode, addtl_funcs=addtl_funcs + zipcode=zipcode ) return address diff --git a/scourgify/tests/test_address_normalization.py b/scourgify/tests/test_address_normalization.py index 45b2628..53dc50a 100644 --- a/scourgify/tests/test_address_normalization.py +++ b/scourgify/tests/test_address_normalization.py @@ -579,24 +579,44 @@ def test_validate_postal_code(self): """Test validate_us_postal_code_format""" with self.assertRaises(AddressValidationError): - zip_plus = '97219-0001-00' - validate_us_postal_code_format(zip_plus, self.address_dict) + zip_five = 'AAAAA' + validate_us_postal_code_format(zip_five, self.address_dict) + + with self.assertRaises(AddressValidationError): + zip_five = '97219-AAAA' + validate_us_postal_code_format(zip_five, self.address_dict) with self.assertRaises(AddressValidationError): - zip_plus = '97219-00' + zip_plus = '97219-000100' validate_us_postal_code_format(zip_plus, self.address_dict) with self.assertRaises(AddressValidationError): - zip_plus = '972-0001' + zip_plus = '97219-0001-00' validate_us_postal_code_format(zip_plus, self.address_dict) with self.assertRaises(AddressValidationError): zip_five = '9721900' validate_us_postal_code_format(zip_five, self.address_dict) - with self.assertRaises(AddressValidationError): - zip_five = '972' - validate_us_postal_code_format(zip_five, self.address_dict) + zip_five = '972' + expected = '00972' + result = validate_us_postal_code_format(zip_five, self.address_dict) + self.assertEqual(expected, result) + + zip_plus = '97219-00' + expected = '97219-0000' + result = validate_us_postal_code_format(zip_plus, self.address_dict) + self.assertEqual(expected, result) + + zip_plus = '972-0001' + expected = '00972-0001' + result = validate_us_postal_code_format(zip_plus, self.address_dict) + self.assertEqual(expected, result) + + zip_plus = '972190001' + expected = '97219-0001' + result = validate_us_postal_code_format(zip_plus, 
self.address_dict) + self.assertEqual(expected, result) expected = '97219' result = validate_us_postal_code_format(expected, self.address_dict) diff --git a/scourgify/validations.py b/scourgify/validations.py index f200ee3..e742ab6 100644 --- a/scourgify/validations.py +++ b/scourgify/validations.py @@ -97,14 +97,28 @@ def validate_us_postal_code_format(postal_code, address): 'US Postal Codes must conform to five-digit Zip or Zip+4 standards.' ) postal_code = post_clean_addr_str(postal_code) - if '-' in postal_code: - plus_four_code = postal_code.split('-') - if len(plus_four_code) != 2: + plus_four_code = postal_code.split('-') + for code in plus_four_code: + try: + int(code) + except ValueError: error = True - elif len(plus_four_code[0]) != 5 or len(plus_four_code[1]) != 4: + if not error: + if '-' in postal_code: + if len(postal_code.replace('-', '')) > 9: + error = True + elif len(plus_four_code) != 2: + error = True + else: + postal_code = '-'.join([ + plus_four_code[0].zfill(5), plus_four_code[1].zfill(4) + ]) + elif len(postal_code) == 9: + postal_code = '-'.join([postal_code[:5], postal_code[5:]]) + elif len(postal_code) > 5: error = True - elif len(postal_code) != 5: - error = True + else: + postal_code = postal_code.zfill(5) if error: raise AddressValidationError(msg, None, address) diff --git a/setup.cfg b/setup.cfg index cd4ebea..ab5ddd2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name=usaddress-scourgify -version=0.3.0 +version=0.4.0 description=Clean US addresses following USPS pub 28 and RESO guidelines author=Fable Turas author_email=fable@rainsoftware.tech