derek73 · patvdleer · Jan 31, 2022 · Jan 31, 2022 · Feb 3, 2022 · Jun 18, 2022
diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
 The :py:mod:`nameparser.config` module manages the configuration of the
-nameparser. 
+nameparser.
 
 A module-level instance of :py:class:`~nameparser.config.Constants` is created
 and used by default for all HumanName instances. You can adjust the entire module's
@@ -25,11 +25,12 @@
     >>> hn.parse_full_name() # need to run this again after config changes
 
 **Potential Gotcha**: If you do not pass ``None`` as the second argument,
-``hn.C`` will be a reference to the module config, possibly yielding 
+``hn.C`` will be a reference to the module config, possibly yielding
 unexpected results. See `Customizing the Parser <customize.html>`_.
 """
 from __future__ import unicode_literals
 import sys
+
 try:
     # Python 3.3+
     from collections.abc import Set
@@ -46,6 +47,7 @@
 from nameparser.config.titles import TITLES
 from nameparser.config.titles import FIRST_NAME_TITLES
 from nameparser.config.regexes import REGEXES
+from nameparser.config.affixes import AFFIXES
 
 DEFAULT_ENCODING = 'UTF-8'
 
@@ -57,7 +59,7 @@ class SetManager(Set):
 
     Only special functionality beyond that provided by set() is
     to normalize constants for comparison (lower case, no periods)
-    when they are add()ed and remove()d and allow passing multiple 
+    when they are add()ed and remove()d and allow passing multiple
     string arguments to the :py:func:`add()` and :py:func:`remove()` methods.
 
     '''
@@ -125,7 +127,7 @@ def remove(self, *strings):
 
 class TupleManager(dict):
     '''
-    A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants 
+    A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants
     more friendly.
     '''
 
@@ -148,23 +150,25 @@ class Constants(object):
     """
     An instance of this class hold all of the configuration constants for the parser.
 
-    :param set prefixes: 
+    :param set prefixes:
         :py:attr:`prefixes` wrapped with :py:class:`SetManager`.
+    :param set family_affixes:
+        :py:attr:`~affixes.AFFIXES` wrapped with :py:class:`SetManager`.
-    :param set titles: 
+    :param set titles:
         :py:attr:`titles` wrapped with :py:class:`SetManager`.
-    :param set first_name_titles: 
+    :param set first_name_titles:
         :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
-    :param set suffix_acronyms: 
+    :param set suffix_acronyms:
         :py:attr:`~suffixes.SUFFIX_ACRONYMS`  wrapped with :py:class:`SetManager`.
-    :param set suffix_not_acronyms: 
+    :param set suffix_not_acronyms:
         :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS`  wrapped with :py:class:`SetManager`.
-    :param set conjunctions: 
+    :param set conjunctions:
         :py:attr:`conjunctions`  wrapped with :py:class:`SetManager`.
     :type capitalization_exceptions: tuple or dict
-    :param capitalization_exceptions: 
+    :param capitalization_exceptions:
         :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
     :type regexes: tuple or dict
-    :param regexes: 
+    :param regexes:
         :py:attr:`regexes`  wrapped with :py:class:`TupleManager`.
     """
 
@@ -187,17 +191,17 @@ class Constants(object):
     empty_attribute_default = ''
     """
     Default return value for empty attributes.
-    
+
     .. doctest::
-    
+
         >>> from nameparser.config import CONSTANTS
         >>> CONSTANTS.empty_attribute_default = None
         >>> name = HumanName("John Doe")
         >>> name.title
         None
         >>>name.first
         'John'
-        
+
     """
 
     capitalize_name = False
@@ -233,6 +237,7 @@ class Constants(object):
 
     def __init__(self,
                  prefixes=PREFIXES,
+                 family_affixes=AFFIXES,
                  suffix_acronyms=SUFFIX_ACRONYMS,
                  suffix_not_acronyms=SUFFIX_NOT_ACRONYMS,
                  titles=TITLES,
@@ -242,6 +247,7 @@ def __init__(self,
                  regexes=REGEXES
                  ):
         self.prefixes = SetManager(prefixes)
+        self.family_affixes = SetManager(family_affixes)
         self.suffix_acronyms = SetManager(suffix_acronyms)
         self.suffix_not_acronyms = SetManager(suffix_not_acronyms)
         self.titles = SetManager(titles)

diff --git a/nameparser/config/affixes.py b/nameparser/config/affixes.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+# Family name affixes, lower case, e.g. the "van" in "van Gogh".
+# Source: https://en.wikipedia.org/wiki/List_of_family_name_affixes
+AFFIXES = set([
+    'a',
+    'ab',
+    'af',
+    'av',
+    'ap',
+    'abu',
+    'ait',
+    'aït',
+    'alam',
+    'at',
+    'ath',
+    'aust',
+    'austre',
+    'bar',
+    'bat',
+    'bath',
+    'ben',
+    'bin',
+    'ibn',
+    'bert',
+    'bet',
+    'bint',
+    'da',
+    'das',
+    'de',
+    'degli',
+    'del',
+    'dele',
+    'della',
+    'den',
+    'der',
+    'di',
+    'dos',
+    'du',
+    'e',
+    'el',
+    'fetch',
+    'vetch',
+    'fitz',
+    'i',
+    'kil',
+    'gil',
+    'la',
+    'le',
+    'lille',
+    'lu',
+    'm\'',
+    'mc',
+    'mac',
+    'mck',
+    'mhic',
+    'mic',
+    'mala',
+    'mellom',
+    'myljom',
+    'na',
+    'ned',
+    'nedre',
+    'neder',
+    'nic',
+    'ni',
+    'ní',
+    'nin',
+    'nord',
+    'norr',
+    'nordre',
+    'ny',
+    'o',
+    'ua',
+    'ui',
+    'uí',
+    'opp',
+    'upp',
+    'ofver',
+    'ost',
+    'oster',
+    'over',
+    'ovste',
+    'ovre',
+    'oz',
+    'pour',
+    'putra',
+    'putera',
+    'putri',
+    'setia',
+    'setya',
+    'stor',
+    'soder',
+    'sor',
+    'sonder',
+    'syd',
+    'sondre',
+    'syndre',
+    'sore',
+    'ter',
+    '\'t',
+    'tre',
+    'van',
+    'het',
+    'vast',
+    'väst',
+    'vaster',
+    'väster',
+    'verch',
+    'erch',
+    'vest',
+    'vestre',
+    'vesle',
+    'vetle',
+    'von',
+    'war',
+    'zu',
+])
diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -47,6 +47,8 @@ class HumanName(object):
     * :py:attr:`suffix`
     * :py:attr:`nickname`
     * :py:attr:`surnames`
+    * :py:attr:`family`
+    * :py:attr:`family_prefix`
 
     :param str full_name: The name string to be parsed.
     :param constants constants:
@@ -300,6 +302,16 @@ def last(self):
         """
         return " ".join(self.last_list) or self.C.empty_attribute_default
 
+    @property
+    def family(self):
+        """
+        The person's family name, including any family affixes such as
+        "van der", or ``empty_attribute_default`` when there is none.
+        """
+        # Each entry of family_list is an ``[affix_words, family_words]`` pair.
+        names = [" ".join(affix + family) for affix, family in self.family_list]
+        return " ".join(names) or self.C.empty_attribute_default
+
     @property
     def suffix(self):
         """
@@ -399,6 +411,19 @@ def is_prefix(self, piece):
         else:
             return lc(piece) in self.C.prefixes
 
+    def is_family_affix(self, piece):
+        """
+        Lowercase and no periods version of piece is in the
+        :py:data:`~nameparser.config.affixes.AFFIXES` set. If ``piece`` is a
+        list, return ``True`` when any of its items is a family affix.
+        """
+        if isinstance(piece, list):
+            # any() always yields a bool; the explicit loop fell through
+            # and returned None when nothing matched.
+            return any(self.is_family_affix(item) for item in piece)
+        return lc(piece) in self.C.family_affixes
+
+
     def is_roman_numeral(self, value):
         """
         Matches the ``roman_numeral`` regular expression in
@@ -513,9 +538,9 @@ def parse_nicknames(self):
         Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
         `quoted_word`, `double_quotes` and `parenthesis`.
         """
-        
+
         empty_re = re.compile("")
-        
+
         re_quoted_word = self.C.regexes.quoted_word or empty_re
         re_double_quotes = self.C.regexes.double_quotes or empty_re
         re_parenthesis = self.C.regexes.parenthesis or empty_re
@@ -563,6 +588,7 @@ def parse_full_name(self):
         self.first_list = []
         self.middle_list = []
         self.last_list = []
+        self.family_list = []
         self.suffix_list = []
         self.nickname_list = []
         self.unparsable = True
@@ -699,6 +725,19 @@ def parse_full_name(self):
                 except IndexError:
                     pass
 
+        for last in self.last_list:
+            if " " in last:
+                affix = []
+                family = []
+                for part in last.split(" "):
+                    if self.is_family_affix(part):
+                        affix.append(part)
+                    else:
+                        family.append(part)
+                self.family_list.append([affix, family])
+            else:
+                self.family_list.append([[], [last]])
+
         if len(self) < 0:
             log.info("Unparsable: \"%s\" ", self.original)
         else:
@@ -968,6 +1007,7 @@ def capitalize(self, force=None):
         self.first_list = self.cap_piece(self.first, 'first').split(' ')
         self.middle_list = self.cap_piece(self.middle, 'middle').split(' ')
         self.last_list = self.cap_piece(self.last, 'last').split(' ')
+        # TODO: family_list is not re-capitalized here, so ``family`` may lag behind ``last``.
         self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ')
 
     def handle_capitalization(self):

diff --git a/tests.py b/tests.py
@@ -187,6 +187,20 @@ def test_prefix_names(self):
         self.m(hn.first, "vai", hn)
         self.m(hn.last, "la", hn)
 
+    def test_family_name_and_prefix(self):
+        hn = HumanName("Vincent van Gogh")
+        self.m(hn.family, "van Gogh", hn)
+        self.assertEqual(hn.family_list, [
+            [["van"], ["Gogh"]]
+        ])
+
+    def test_family_name_and_double_prefix(self):
+        hn = HumanName("Vincent van der Gogh")
+        self.m(hn.family, "van der Gogh", hn)
+        self.assertEqual(hn.family_list, [
+            [["van", "der"], ["Gogh"]],
 # join everything after the prefix until the next prefix or suffix 
 try: 
     if i == 0 and total_length >= 1: 
         # If it's the first piece and there are more than 1 rootnames, assume it's a first name 
         continue 
     next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) 
     j = pieces.index(next_prefix) 
     if j == i + 1: 
         # if there are two prefixes in sequence, join to the following piece 
         j += 1 
     new_piece = ' '.join(pieces[i:j]) 
     pieces = pieces[:i] + [new_piece] + pieces[j:] 
 except StopIteration: 
     try: 
         # if there are no more prefixes, look for a suffix to stop at 
         stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) 
         j = pieces.index(stop_at) 
         new_piece = ' '.join(pieces[i:j]) 
         pieces = pieces[:i] + [new_piece] + pieces[j:] 
     except StopIteration: 
         # if there were no suffixes, nothing to stop at so join all 
         # remaining pieces 
         new_piece = ' '.join(pieces[i:]) 
         pieces = pieces[:i] + [new_piece] 
 # join everything after the prefix until the next prefix or suffix 
  
 try: 
     if i == 0 and total_length >= 1: 
         # If it's the first piece and there are more than 1 rootnames, assume it's a first name 
         continue 
     next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) 
     j = pieces.index(next_prefix) 
     if j == i + 1: 
         # if there are two prefixes in sequence, join to the following piece 
         j += 1 
     new_piece = ' '.join(pieces[i:j]) 
     pieces = pieces[:i] + [new_piece] + pieces[j:] 
 except StopIteration: 
     try: 
         # if there are no more prefixes, look for a suffix to stop at 
         stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) 
         j = pieces.index(stop_at) 
         new_piece = ' '.join(pieces[i:j]) 
         pieces = pieces[:i] + [new_piece] + pieces[j:] 
     except StopIteration: 
         # if there were no suffixes, nothing to stop at so join all 
         # remaining pieces 
         new_piece = ' '.join(pieces[i:]) 
         pieces = pieces[:i] + [new_piece] 
+        ])
+
     def test_blank_name(self):
         hn = HumanName()
         self.m(hn.first, "", hn)