Merge pull request #325 from MortezaMahdaviMortazavi/master
Add files via upload
imani authored May 2, 2024
2 parents 1cb2f10 + 21a4001 commit 5f5a88c
Showing 2 changed files with 25 additions and 211 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -83,7 +83,7 @@ Finally if you want to use our pretrained models, you can download it from the l
| [**Download DependencyParser**](https://drive.google.com/file/d/1MDapMSUXYfmQlu0etOAkgP5KDiWrNAV6/view?usp=share_link) | ~ 15 MB |
| [**Download Chunker**](https://drive.google.com/file/d/16hlAb_h7xdlxF4Ukhqk_fOV3g7rItVtk) | ~ 4 MB |
| [**Download spacy_pos_tagger_parsbertpostagger**](https://huggingface.co/roshan-research/spacy_pos_tagger_parsbertpostagger) | ~ 630 MB |
| [**Download spacy_pos_tagger_parsbertpostagger95**](https://huggingface.co/roshan-research/spacy_pos_tagger_parsbertpostagger95)| ~ 630 MB |
| [**Download spacy_pos_tagger_parsbertpostagger_Trained_on_95%**](https://huggingface.co/roshan-research/spacy_pos_tagger_parsbertpostagger95)| ~ 630 MB |
| [**Download spacy_chunker_uncased_bert**](https://huggingface.co/roshan-research/spacy_chunker_uncased_bert) | ~ 650 MB |
| [**Download spacy_chunker_parsbert**](https://huggingface.co/roshan-research/spacy_chunker_parsbert) | ~ 630 MB |
| [**Download spacy_dependency_parser**](https://huggingface.co/roshan-research/spacy_dependency_parser) | ~ 630 MB |
@@ -148,6 +148,16 @@ Finally if you want to use our pretrained models, you can download it from the l
>>> spacy_parser = SpacyDependencyParser(tagger=tagger, lemmatizer=lemmatizer)
>>> spacy_parser.parse_sents([word_tokenize('زنگ‌ها برای که به صدا درمی‌آید؟')])

>>> ner = HazmNER(model_path='ner/model-best')
>>> ner.predict_entity('حمله سایبری به سامانه سوخت در دولت سیزدهم برای بار دوم اتفاق افتاد، حادثه‌ای که در سال 1400 هم به وقوع پیوست اما رفع این مشکل بیش از یک هفته زمان برد، در حالی که آذر امسال پس از این حمله همه پمپ‌بنزین‌ها در کمتر از 24 ساعت فعالیت خود را از سر گرفتند.')
>>> ner.predict(
        [
            'ریو در ایران توسط شرکت سایپا از سال 1384 تا سال 1391 تولید شد',
            'به جز ایالات متحده ، این خودرو در اروپا ، آمریکای جنوبی و آسیا هم فروش بالایی داشته است',
            'این گاه شمار با قدمتی کمتر از دویست سال ، از جدیدترین گاه شمار های رایج به شمار می رود'
        ]
    )

```

## Documentation
224 changes: 14 additions & 210 deletions hazm/ner.py
@@ -4,157 +4,23 @@



def prepare_conll_data_format(
    path: str,
    sep: str = "\t",
    verbose: bool = True,
) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Prepare data in CoNLL-like format.
    Args:
    - path (str): The path to the CoNLL-formatted file.
    - sep (str): Separator used to split tokens and labels. Default is "\t".
    - verbose (bool): Flag indicating whether to display a progress bar. Default is True.
    Returns:
    - Tuple[List[List[str]], List[List[str]]]: A tuple containing token sequences and label sequences.
    """
    # Initialize lists to store token and label sequences
    token_seq = []
    label_seq = []

    # Open the file and read line by line
    with open(path, mode="r", encoding="utf-8") as fp:
        tokens = []
        labels = []

        # Optionally display a progress bar
        if verbose:
            fp = tqdm(fp)

        # Iterate through each line in the file
        for line in fp:
            # A non-empty line holds one token-label pair
            if line != "\n":
                try:
                    # Split the line into token and label using the specified separator
                    token, label = line.strip().split(sep)
                    tokens.append(token)
                    labels.append(label)
                except ValueError:
                    # Skip malformed lines that do not split into exactly two fields
                    continue
            else:
                # An empty line marks the end of a sentence
                if len(tokens) > 0:
                    token_seq.append(tokens)
                    label_seq.append(labels)
                    tokens = []
                    labels = []

    return token_seq, label_seq
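
For reference, a minimal sketch of the tab-separated input this function expects; the file name and labels are illustrative:

```python
# Hypothetical sample: each line is "token<TAB>label"; a blank line ends a sentence.
sample = "تهران\tB-LOC\nبزرگ\tO\n\nسایپا\tB-ORG\n\n"
with open("sample.conll", "w", encoding="utf-8") as f:
    f.write(sample)

tokens, labels = prepare_conll_data_format("sample.conll", verbose=False)
print(tokens)  # [['تهران', 'بزرگ'], ['سایپا']]
print(labels)  # [['B-LOC', 'O'], ['B-ORG']]
```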


def convert_to_spacy_format(data):
    """
    Convert data from CoNLL-like format to SpaCy format.
    Args:
    - data (List[Tuple[str, str]]): List of tuples containing token-label pairs.
    Returns:
    - Tuple[str, List[Tuple[int, int, str]]]: A tuple containing the processed text and entity annotations.
    """
    # Initialize variables to store text and entities
    text = ''
    entities = []

    # Iterate through each token-label pair
    for word, label in data:
        # Append the word to the running text; a single space joins tokens
        text += ' ' + word
        # If the label marks an entity, record its character span.
        # Offsets are relative to the final stripped text: the -1
        # compensates for the leading space added above.
        if label != 'O':
            entities.append((len(text) - len(word) - 1, len(text) - 1, label))

    # Merge adjacent entities with the same label
    if text:
        return text.strip(), merge_tags(entities)
    return text, []
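
A quick worked example of the conversion (the token-label pairs are made up); note the returned spans are end-exclusive offsets into the stripped text:

```python
# Hypothetical input: a single-token LOC entity followed by two 'O' tokens
text, entities = convert_to_spacy_format(
    [("Tehran", "B-LOC"), ("is", "O"), ("big", "O")]
)
print(text)      # Tehran is big
print(entities)  # [(0, 6, 'LOC')] -- text[0:6] == 'Tehran'
```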

def merge_tags(tags):
    """
    Merge adjacent entities with the same label.
    Args:
    - tags (List[Tuple[int, int, str]]): List of entity annotations.
    Returns:
    - List[Tuple[int, int, str]]: List of merged entity annotations.
    """
    merged_tags = []
    current_tag = None
    start = None
    end = None

    for start_idx, end_idx, tag in tags:
        if tag.startswith('B-'):
            # A B- tag opens a new entity; close any open one first
            if current_tag is not None:
                merged_tags.append((start, end, current_tag))
            current_tag = tag[2:]
            start = start_idx
            end = end_idx
        elif tag.startswith('I-'):
            # An I- tag extends the open entity when the labels match
            if current_tag is not None and tag[2:] == current_tag:
                end = end_idx
        else:  # tag == 'O'
            if current_tag is not None:
                merged_tags.append((start, end, current_tag))
            current_tag = None

    if current_tag is not None:
        merged_tags.append((start, end, current_tag))

    return merged_tags
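
And a short illustrative call (spans made up) showing a B-/I- run collapsing into one span:

```python
# Hypothetical spans: a two-token PER entity followed by a LOC entity
print(merge_tags([(0, 5, 'B-PER'), (6, 10, 'I-PER'), (11, 14, 'B-LOC')]))
# [(0, 10, 'PER'), (11, 14, 'LOC')]
```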




# Removed in this commit:
class BaseNER(object):
    def __init__(self, model_path):
        """
        load_data: Load data from a file or any data source.
        preprocess_data: Preprocess the loaded data, including tokenization, normalization, and any other necessary steps.
        train_model: Train the NER model using the preprocessed data.
        evaluate_model: Evaluate the trained model using appropriate metrics.
        predict_entities: Predict named entities in new text using the trained model.
        save_model: Save the trained NER model for future use.
        load_model: Load a pre-trained NER model from disk.
        """
        pass


class HazmNER(BaseNER):
    def __init__(self, model_path):
        """
        Initialize the HazmNER object.
        Parameters:
        model_path (str): The path to the pre-trained NER model.
        """
        super().__init__(model_path)
        self.model_path = model_path
        self.model = self.load_model(model_path)


# Added in this commit:
from spacy.tokens import Doc
from spacy.tokens import DocBin
from spacy.vocab import Vocab


class HazmNER:
    def __init__(self, model_path, use_gpu=False):
        """
        Initialize the HazmNER object.
        Parameters:
        model_path (str): The path to the pre-trained NER model.
        use_gpu (bool): Whether to use GPU for processing.
        """
        self.model_path = model_path
        self.use_gpu = use_gpu
        self.model = self._load_model(model_path, use_gpu)

def predict_entities(self, sentences):
"""
Predict named entities in a list of sentences.
@@ -193,81 +59,19 @@ def evaluate_model(self, dataset_path):
        dataset_path (str): Path to the evaluation dataset.
        """
        # Pass the command as a list so subprocess runs it without shell=True
        subprocess.run(["python", "-m", "spacy", "evaluate", self.model_path, dataset_path])
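
An illustrative call, assuming a preprocessed `.spacy` dataset on disk; this shells out to spaCy's `evaluate` CLI:

```python
# Hypothetical paths; prints spaCy's precision/recall/F1 report
ner = HazmNER(model_path='ner/model-best')
ner.evaluate_model('data/val.spacy')
```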


    def _save_spacy_data(self, data, save_path):
        """
        Save data in Spacy format.
        Parameters:
        data (list of tuple): Data to be saved in Spacy format.
        save_path (str): Path to save the Spacy data.
        """
        nlp = spacy.blank("fa")
        db = DocBin()
        for text, annotations in tqdm(data):
            doc = nlp(text)
            ents = []
            if annotations:
                for start, end, label in annotations:
                    span = doc.char_span(start, end, label=label)
                    # char_span returns None when the offsets do not align
                    # with token boundaries; skip such spans
                    if span is not None:
                        ents.append(span)
            else:
                # Skip samples without annotations
                continue
            doc.ents = ents
            db.add(doc)
        db.to_disk(save_path)
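
For reference, a hedged sketch of reading a saved `DocBin` back from disk to verify the annotations (file name illustrative):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("fa")
# Deserialize the docs written by _save_spacy_data
db = DocBin().from_disk("data/train.spacy")
docs = list(db.get_docs(nlp.vocab))
# Inspect the first doc's entities (assumes at least one doc was saved)
print([(ent.text, ent.label_) for ent in docs[0].ents])
```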

    def _preprocess_data(self, data_path, save_path, sep, set_type='train'):
        """
        Preprocess data for training or evaluation.
        Parameters:
        data_path (str): Path to the data file.
        save_path (str): Path to save the preprocessed data.
        sep (str): Separator used in the data file.
        set_type (str): Type of data (train or val).
        Raises:
        AssertionError: If set_type is not 'train' or 'val'.
        """
        assert set_type in ['train', 'val']
        data = []
        spacy_data = []
        tokens, entities = prepare_conll_data_format(data_path, sep=sep, verbose=False)
        # Pair each token with its label, sentence by sentence
        for i in range(len(tokens)):
            data.append(list(zip(tokens[i], entities[i])))

        # Convert every sentence to (text, entity spans)
        for sample in data:
            spacy_data.append(convert_to_spacy_format(sample))

        self._save_spacy_data(spacy_data, save_path + set_type + ".spacy")


    def train_model(self, model_save_path, train_path, dev_path, data_save_path, sep):
        """
        Train the NER model.
        Parameters:
        model_save_path (str): Path to save the trained model.
        train_path (str): Path to the training data.
        dev_path (str): Path to the validation data.
        data_save_path (str): Path to save the preprocessed data.
        sep (str): Separator used in the data files.
        """
        self._preprocess_data(train_path, save_path=data_save_path, sep=sep, set_type='train')
        self._preprocess_data(dev_path, save_path=data_save_path, sep=sep, set_type='val')
        # Train on the .spacy files written by _preprocess_data above
        subprocess.run([
            "python", "-m", "spacy", "train", "config.cfg",
            "--output", model_save_path,
            "--paths.train", data_save_path + "train.spacy",
            "--paths.dev", data_save_path + "val.spacy",
        ])
        # spacy train writes the best checkpoint to <output>/model-best
        self.model = self._load_model(model_save_path + "/model-best", self.use_gpu)
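
The training command assumes a `config.cfg` in the working directory; a sketch of generating one with spaCy's CLI and kicking off training (paths illustrative; `ner` is a HazmNER instance as in the README example):

```python
import subprocess

# Hypothetical one-time setup: generate a training config for a Persian NER pipeline
subprocess.run(["python", "-m", "spacy", "init", "config", "config.cfg",
                "--lang", "fa", "--pipeline", "ner"])

# Preprocess both splits and launch training (all paths are illustrative)
ner.train_model(
    model_save_path="output",
    train_path="data/train.conll",
    dev_path="data/dev.conll",
    data_save_path="data/",
    sep="\t",
)
```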

    # Removed in this commit: def _load_model(self, model_path):
    # Added:
    def _load_model(self, model_path, use_gpu):
        """
        Load the trained NER model.
        Parameters:
        model_path (str): Path to the trained model.
        use_gpu (bool): Whether to use GPU for processing.
        Returns:
        spacy.Language: Loaded NER model.
        """
        import spacy
        if use_gpu:
            # Allocate spaCy on the GPU; raises if no GPU is available
            spacy.require_gpu()
        return spacy.load(model_path)
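
A minimal usage sketch of the reworked loader, assuming a trained pipeline at `ner/model-best` (the path used in the README example); `use_gpu=True` requires a GPU-enabled spaCy install:

```python
# Load on CPU (default) or on GPU when CUDA and cupy are available
ner = HazmNER(model_path='ner/model-best')
ner_gpu = HazmNER(model_path='ner/model-best', use_gpu=True)
```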
