Commit 28ef0a9

simplified stanza tests to reduce memory
1 parent 11600a3 commit 28ef0a9

1 file changed: +10 −152 lines changed

@@ -1,30 +1,31 @@
 """Tests adapted from the spacy_stanza repo"""

 from spacy.lang.en import EnglishDefaults, English
-from spacy.lang.de import GermanDefaults
-from spacy.lang.es import SpanishDefaults
+

 import stanza
 import pytest

-from presidio_analyzer.nlp_engine.stanza_nlp_engine import (
-    load_pipeline,
-    StanzaTokenizer,
-)
+from presidio_analyzer.nlp_engine.stanza_nlp_engine import load_pipeline


 def tags_equal(act, exp):
     """Check if each actual tag in act is equal to one or more expected tags in exp."""
     return all(a == e if isinstance(e, str) else a in e for a, e in zip(act, exp))


-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_english():
+@pytest.fixture(scope="module")
+def stanza_pipeline():
     lang = "en"
     stanza.download(lang)
     nlp = load_pipeline(lang)
-    assert nlp.Defaults == EnglishDefaults
+    return nlp

+@pytest.mark.skip_engine("stanza_en")
+def test_spacy_stanza_english(stanza_pipeline):
+    nlp = stanza_pipeline
+    assert nlp.Defaults == EnglishDefaults
+    lang = "en"
     doc = nlp("Hello world! This is a test.")

     # Expected POS tags. Note: Different versions of stanza result in different
@@ -96,149 +97,6 @@ def test_spacy_stanza_english():
     assert doc.ents[1].text == "Hawaii"
     assert doc.ents[1].label_ == "GPE"

-    # Test trailing whitespace handling
-    doc = nlp("a ")
-    doc = nlp("a  ")
-    doc = nlp("a \n")
-    doc = nlp("\n ")
-    doc = nlp("\t ")
-    doc = nlp("a\n ")
-    doc = nlp("a \t ")
-
     # Test serialization
     reloaded_nlp = load_pipeline(lang).from_bytes(nlp.to_bytes())
     assert reloaded_nlp.config.to_str() == nlp.config.to_str()
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_german():
-    lang = "de"
-    stanza.download(lang)
-    nlp = load_pipeline(lang)
-    assert nlp.Defaults == GermanDefaults
-
-    # warning for misaligned ents due to multi-word token expansion
-    with pytest.warns(UserWarning):
-        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_spanish():
-    lang = "es"
-    stanza.download(lang)
-    nlp = load_pipeline(lang)
-    snlp = nlp.tokenizer.snlp
-    assert nlp.Defaults == SpanishDefaults
-
-    # Example from the training data so that predicted labels are likely correct
-    # https://github.com/UniversalDependencies/UD_Spanish-AnCora
-    text = "Las reservas en oro se valoran en base a 300 dólares estadounidenses por cada onza troy de oro."
-    doc = nlp(text)
-    sdoc = snlp(text)
-
-    # In the training data (UD<v2.9), the xpos columns are empty, meaning that
-    # xpos = None in stanza. In this case, the pos (upos) should be copied to tag (xpos)
-    # UDv2.9 does have xpos tags. So to make sure this test runs successfully, only
-    # run it when we know that the original stanza xpos is None (UD<v2.9)
-    if all(w.xpos is None for sent in sdoc.sentences for w in sent.words):
-        assert (
-            [t.pos_ for t in doc]
-            == [t.tag_ for t in doc]
-            == [
-                "DET",
-                "NOUN",
-                "ADP",
-                "NOUN",
-                "PRON",
-                "VERB",
-                "ADP",
-                "NOUN",
-                "ADP",
-                "NUM",
-                "NOUN",
-                "ADJ",
-                "ADP",
-                "DET",
-                "NOUN",
-                "NOUN",
-                "ADP",
-                "NOUN",
-                "PUNCT",
-            ]
-        )
-    else:
-        pass
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_tokenizer_options():
-    # whitespace tokens from spacy tokenizer are handled correctly
-    lang = "en"
-    stanza.download(lang)
-    nlp = load_pipeline(lang, processors={"tokenize": "spacy"})
-
-    doc = nlp(" Barack  Obama  was  born\n\nin Hawaii.")
-    assert [t.text for t in doc] == [
-        " ",
-        "Barack",
-        " ",
-        "Obama",
-        " ",
-        "was",
-        " ",
-        "born",
-        "\n\n",
-        "in",
-        "Hawaii",
-        ".",
-    ]
-
-    # pretokenized text is handled correctly
-    nlp = load_pipeline(lang, tokenize_pretokenized=True)
-    doc = nlp("Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
-    assert [t.text for t in doc] == [
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-    ]
-    doc = nlp(
-        " Barack Obama was born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
-    )
-    assert [t.text for t in doc] == [
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-    ]
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_from_config():
-    config = {
-        "nlp": {
-            "tokenizer": {
-                "@tokenizers": "PipelineAsTokenizer.v1",
-                "lang": "en",
-            }
-        }
-    }
-    nlp = English.from_config(config)
-    assert nlp.Defaults == EnglishDefaults
-    assert type(nlp.tokenizer) == StanzaTokenizer
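
The memory saving here comes from pytest fixture scoping: a fixture declared with scope="module" is created once and cached, so every test in the module shares a single English pipeline instead of each test downloading and loading its own. A minimal sketch of the pattern, using the same load_pipeline helper as the diff; the test body below is illustrative and not part of this commit:

import pytest
import stanza

from presidio_analyzer.nlp_engine.stanza_nlp_engine import load_pipeline


@pytest.fixture(scope="module")
def stanza_pipeline():
    # Runs once per test module; pytest caches and reuses the result
    # for every test that requests this fixture.
    lang = "en"
    stanza.download(lang)
    return load_pipeline(lang)


def test_pipeline_is_shared(stanza_pipeline):
    # Hypothetical test: pytest injects the cached pipeline by matching
    # the parameter name to the fixture name.
    doc = stanza_pipeline("Hello world!")
    assert doc[0].text == "Hello"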
