Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "names"]
path = names
url = git@github.com:aznlp/names.git
3 changes: 3 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .stemmer_extended import StemmerV2 as Stemmer
from .stemmer import Stemmer as StemmerV1

1 change: 1 addition & 0 deletions names
Submodule names added at 29abcf
14 changes: 14 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@

Stemmer for Azerbaijani language written in Python.

## Usage
```
root
├── stemmer
├── yourmain.py
└── ...
```
```python
from stemmer import Stemmer

stemmer = Stemmer()
lst = stemmer.stem_words(["nümunə",])
```

## Installation

The package will be published on PyPI after finishing the development of the first fully working version.
Expand Down
8 changes: 6 additions & 2 deletions stemmer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import os

# Directory that holds this module (symlinks resolved); used to build data-file paths
PARENT_DIR = os.path.split(os.path.realpath(__file__))[0]

# Stemmer class definition
class Stemmer:
# Stores the words loaded from the words.txt file
Expand All @@ -23,7 +27,7 @@ def __del__(self):
# Loads the words from the words.txt file into memory
def __load_words(self):
# Open words.txt file in read mode with utf-8 encoding.
with open("words.txt", "r", encoding="utf8") as words_file:
with open(os.path.join(PARENT_DIR,"words.txt"), "r", encoding="utf8") as words_file:
# Iterate over each line in the words.txt file
for word in words_file:
# Trim the spaces and newline characters from the string before adding to the list
Expand All @@ -32,7 +36,7 @@ def __load_words(self):
# Loads the suffixes from the suffix.txt file into memory
def __load_suffixes(self):
# Open suffix.txt file in read mode with utf-8 encoding
with open("suffix.txt", "r", encoding="utf8") as suffix_file:
with open(os.path.join(PARENT_DIR,"suffix.txt"), "r", encoding="utf8") as suffix_file:
# Iterate over each line in the suffix.txt file
for suffix in suffix_file:
# Trim the spaces and newline characters from the string before adding to the list
Expand Down
47 changes: 47 additions & 0 deletions stemmer_extended.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
'''
Subclasses the original Stemmer for additional functionality
'''
import os
from .stemmer import Stemmer

# Directory that holds this module (symlinks resolved); the "names" data
# directory is looked up relative to it.
PARENT_DIR = os.path.split(os.path.realpath(__file__))[0]

class StemmerV2(Stemmer):
    """Stemmer that also knows personal names and accepts multi-word phrases.

    Extends the base ``Stemmer`` in two ways:

    * ``self.words`` is enlarged with the names read from ``names/male.txt``
      and ``names/female.txt`` (located next to this module).
    * ``stem_words`` accepts whitespace-separated phrases and stems only the
      last word of each phrase.
    """

    def __init__(self):
        """Initialise the base stemmer, then merge the name lists into ``self.words``."""
        super().__init__()
        # ``union`` returns a new set, so the result has to be re-bound.
        self.words = self.words.union(self.__load_names())

    def __load_names(self):
        """Read the male and female name files into one set.

        Each line is lower-cased and stripped of surrounding whitespace;
        blank lines are skipped so the empty string never becomes a "word".

        Returns:
            set[str]: the names from both files.

        Raises:
            OSError: if either name file cannot be opened.
        """
        # NOTE(review): str.lower() is locale-independent, so the Azerbaijani
        # dotted/dotless I pairs ("İ"/"i", "I"/"ı") are not mapped the way the
        # language expects - confirm this matches how the base stemmer
        # normalises its input.
        names_dir = os.path.join(PARENT_DIR, "names")
        names = set()
        for file_name in ("male.txt", "female.txt"):
            path = os.path.join(names_dir, file_name)
            with open(path, "r", encoding="utf8") as names_file:
                for line in names_file:
                    name = line.lower().strip()
                    if name:
                        # In-place add: the previous code called
                        # ``set.union`` and dropped its return value, which
                        # silently discarded every female name.
                        names.add(name)
        return names

    def stem_words(self, list_of_phrases):
        """Stem the last word of every phrase, keeping the leading words.

        Each phrase is split on whitespace. The final word is passed to the
        base class ``stem_words``; the preceding words are re-joined with
        single spaces and put back in front of the returned stem.

        Args:
            list_of_phrases: iterable of strings (single words or phrases).

        Returns:
            list[str]: one entry per input phrase, in input order. An empty
            or whitespace-only phrase yields ``""``.

        Raises:
            AssertionError: if the base stemmer does not return exactly one
                stem per word it was given.
        """
        # ``None`` in ``phrase_starts`` marks a blank phrase: it has no last
        # word to stem and is not handed to the base stemmer.
        phrase_starts = []
        phrase_endings = []
        for phrase in list_of_phrases:
            phrase_parts = phrase.split()
            if not phrase_parts:
                phrase_starts.append(None)
                continue
            phrase_starts.append(" ".join(phrase_parts[:-1]))
            phrase_endings.append(phrase_parts[-1])

        list_of_stems = super().stem_words(phrase_endings)

        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; otherwise a length mismatch would pair
        # stems with the wrong phrases.
        if len(list_of_stems) != len(phrase_endings):
            raise AssertionError(
                "base stemmer returned %d stems for %d words"
                % (len(list_of_stems), len(phrase_endings))
            )

        stems = iter(list_of_stems)
        retval = []
        for start in phrase_starts:
            if start is None:
                retval.append("")
                continue
            end = next(stems)
            # Single-word phrases have an empty start: return the bare stem.
            retval.append(" ".join([start, end]) if start else end)

        return retval