aznlp · ahadsuleymanli · Sep 14, 2020 · Sep 14, 2020 · Sep 27, 2020
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "names"]
+	path = names
+	url = git@github.com:aznlp/names.git
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1,3 @@
+from .stemmer_extended import StemmerV2 as Stemmer
+from .stemmer import Stemmer as StemmerV1
+
diff --git a/names b/names
diff --git a/readme.md b/readme.md
@@ -2,6 +2,20 @@
 
 Stemmer for Azerbaijani language written in Python.
 
+## Usage
+```
+root  
+├── stemmer  
+├── yourmain.py  
+└── ...
+```  
+```python
+from stemmer import Stemmer
+
+stemmer = Stemmer()
+lst = stemmer.stem_words(["nümunə",])
+```
+
 ## Installation
 
 The package will be published on PyPI after finishing the development of the first fully working version. 

diff --git a/stemmer.py b/stemmer.py
@@ -1,3 +1,7 @@
+import os
+
+PARENT_DIR = os.path.dirname(os.path.realpath(__file__))
+
 # Stemmer class definition
 class Stemmer:
     # Stores the words loaded from the words.txt file
@@ -23,7 +27,7 @@ def __del__(self):
     # Loads the words from the word.txt file into memory
     def __load_words(self):
         # Open words.txt file in read mode with utf-8 encoding.
-        with open("words.txt", "r", encoding="utf8") as words_file:
+        with open(os.path.join(PARENT_DIR,"words.txt"), "r", encoding="utf8") as words_file:
             # Iterate over each line in the words.txt file
             for word in words_file:
                 # Trim the spaces and newline characters from the string before adding to the list
@@ -32,7 +36,7 @@ def __load_words(self):
     # Loads the suffixes from the suffix.txt file into memory
     def __load_suffixes(self):
         # Open suffix.txt file in read mode with utf-8 encoding
-        with open("suffix.txt", "r", encoding="utf8") as suffix_file:
+        with open(os.path.join(PARENT_DIR,"suffix.txt"), "r", encoding="utf8") as suffix_file:
             # Iterate over each line in the suffix.txt file
             for suffix in suffix_file:
                 # Trim the spaces and newline characters from the string before adding to the list

diff --git a/stemmer_extended.py b/stemmer_extended.py
@@ -0,0 +1,47 @@
+'''
+    Subclasses the original Stemmer for additional functionality
+'''
+import os
+from .stemmer import Stemmer
+
+PARENT_DIR = os.path.dirname(os.path.realpath(__file__))
+
+class StemmerV2(Stemmer):
+    """Stemmer that also recognises personal names and accepts multi-word phrases."""
+
+    def __init__(self):
+        """Run the base initialiser, then merge the bundled name lists into self.words."""
+        super().__init__()
+        self.words = self.words.union(self.__load_names())
+
+    def __load_names(self):
+        """Return the lower-cased, stripped lines of the male and female name files as a set."""
+        names = set()
+        for file_name in ("male.txt", "female.txt"):
+            with open(os.path.join(PARENT_DIR, "names", file_name), "r", encoding="utf8") as names_file:
+                # update() mutates the set in place; set.union() returns a new set
+                # and would leave `names` unchanged, silently dropping a whole file.
+                names.update(line.lower().strip() for line in names_file)
+        return names
+
+    def stem_words(self, list_of_phrases):
+        """Stem only the last word of each phrase, keeping the leading words as-is.
+
+        Takes an iterable of strings and returns a list with one string per
+        phrase, in order. A phrase with no words (empty or whitespace-only)
+        yields "" instead of raising IndexError.
+        """
+        split_phrases = [phrase.split() for phrase in list_of_phrases]
+        # Blank phrases have no final word, so they are not sent to the base stemmer.
+        phrase_endings = [parts[-1] for parts in split_phrases if parts]
+        list_of_stems = super().stem_words(phrase_endings)
+
+        # The re-assembly below relies on exactly one stem per submitted word.
+        assert len(list_of_stems) == len(phrase_endings)
+        stems = iter(list_of_stems)
+        # Words are re-joined with single spaces; str.split() drops the original spacing.
+        return [
+            " ".join(parts[:-1] + [next(stems)]) if parts else ""
+            for parts in split_phrases
+        ]
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .stemmer_extended import StemmerV2 as Stemmer
		from .stemmer import Stemmer as StemmerV1