@@ -1,30 +1,31 @@
 """Tests adapted from the spacy_stanza repo"""
 
 from spacy.lang.en import EnglishDefaults, English
-from spacy.lang.de import GermanDefaults
-from spacy.lang.es import SpanishDefaults
+
 
 import stanza
 import pytest
 
-from presidio_analyzer.nlp_engine.stanza_nlp_engine import (
-    load_pipeline,
-    StanzaTokenizer,
-)
+from presidio_analyzer.nlp_engine.stanza_nlp_engine import load_pipeline
 
 
 def tags_equal(act, exp):
     """Check if each actual tag in act is equal to one or more expected tags in exp."""
     return all(a == e if isinstance(e, str) else a in e for a, e in zip(act, exp))
 
 
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_english():
+@pytest.fixture(scope="module")
+def stanza_pipeline():
     lang = "en"
     stanza.download(lang)
     nlp = load_pipeline(lang)
-    assert nlp.Defaults == EnglishDefaults
+    return nlp
 
+@pytest.mark.skip_engine("stanza_en")
+def test_spacy_stanza_english(stanza_pipeline):
+    nlp = stanza_pipeline
+    assert nlp.Defaults == EnglishDefaults
+    lang = "en"
     doc = nlp("Hello world! This is a test.")
 
     # Expected POS tags. Note: Different versions of stanza result in different
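Two notes on this hunk. First, pipeline construction is now a module-scoped fixture, so the expensive stanza.download and load_pipeline calls run once per test module and their result is shared by every test that requests stanza_pipeline. Second, @pytest.mark.skip_engine is not a built-in pytest mark; it has to be registered and enforced in the suite's conftest. A minimal sketch of how such a marker is commonly wired up follows; the hook, the fixture name, and the available_engines set are illustrative assumptions, not code from this repository:

    # conftest.py -- illustrative sketch only; the real conftest may differ.
    import pytest

    def pytest_configure(config):
        # Register the custom marker so pytest does not warn about unknown marks.
        config.addinivalue_line(
            "markers",
            "skip_engine(name): skip the test when the named NLP engine is unavailable",
        )

    @pytest.fixture(autouse=True)
    def skip_by_engine(request):
        # Hypothetical placeholder for however the suite tracks configured engines.
        available_engines = {"spacy_en", "stanza_en"}
        marker = request.node.get_closest_marker("skip_engine")
        if marker and marker.args[0] not in available_engines:
            pytest.skip(f"engine {marker.args[0]!r} is not available")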
@@ -96,149 +97,6 @@ def test_spacy_stanza_english():
     assert doc.ents[1].text == "Hawaii"
     assert doc.ents[1].label_ == "GPE"
 
-    # Test trailing whitespace handling
-    doc = nlp("a ")
-    doc = nlp("a  ")
-    doc = nlp("a \n")
-    doc = nlp("\n ")
-    doc = nlp("\t ")
-    doc = nlp("a\n ")
-    doc = nlp("a \t ")
-
     # Test serialization
     reloaded_nlp = load_pipeline(lang).from_bytes(nlp.to_bytes())
     assert reloaded_nlp.config.to_str() == nlp.config.to_str()
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_german():
-    lang = "de"
-    stanza.download(lang)
-    nlp = load_pipeline(lang)
-    assert nlp.Defaults == GermanDefaults
-
-    # warning for misaligned ents due to multi-word token expansion
-    with pytest.warns(UserWarning):
-        doc = nlp("Auf dem Friedhof an der Straße Am Rosengarten")
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_spanish():
-    lang = "es"
-    stanza.download(lang)
-    nlp = load_pipeline(lang)
-    snlp = nlp.tokenizer.snlp
-    assert nlp.Defaults == SpanishDefaults
-
-    # Example from the training data so that predicted labels are likely correct
-    # https://github.com/UniversalDependencies/UD_Spanish-AnCora
-    text = "Las reservas en oro se valoran en base a 300 dólares estadounidenses por cada onza troy de oro."
-    doc = nlp(text)
-    sdoc = snlp(text)
-
-    # In the training data (UD<v2.9), the xpos columns are empty, meaning that
-    # xpos = None in stanza. In this case, the pos (upos) should be copied to tag (xpos)
-    # UDv2.9 does have xpos tags. So to make sure this test runs successfully, only
-    # run it when we know that the original stanza xpos is None (UD<v2.9)
-    if all(w.xpos is None for sent in sdoc.sentences for w in sent.words):
-        assert (
-            [t.pos_ for t in doc]
-            == [t.tag_ for t in doc]
-            == [
-                "DET",
-                "NOUN",
-                "ADP",
-                "NOUN",
-                "PRON",
-                "VERB",
-                "ADP",
-                "NOUN",
-                "ADP",
-                "NUM",
-                "NOUN",
-                "ADJ",
-                "ADP",
-                "DET",
-                "NOUN",
-                "NOUN",
-                "ADP",
-                "NOUN",
-                "PUNCT",
-            ]
-        )
-    else:
-        pass
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_tokenizer_options():
-    # whitespace tokens from spacy tokenizer are handled correctly
-    lang = "en"
-    stanza.download(lang)
-    nlp = load_pipeline(lang, processors={"tokenize": "spacy"})
-
-    doc = nlp(" Barack  Obama  was  born\n\nin Hawaii.")
-    assert [t.text for t in doc] == [
-        " ",
-        "Barack",
-        " ",
-        "Obama",
-        " ",
-        "was",
-        " ",
-        "born",
-        "\n\n",
-        "in",
-        "Hawaii",
-        ".",
-    ]
-
-    # pretokenized text is handled correctly
-    nlp = load_pipeline(lang, tokenize_pretokenized=True)
-    doc = nlp("Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
-    assert [t.text for t in doc] == [
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-    ]
-    doc = nlp(
-        " Barack Obama was born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
-    )
-    assert [t.text for t in doc] == [
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-        "Barack",
-        "Obama",
-        "was",
-        "born",
-        "in",
-        "Hawaii.",
-    ]
-
-
-@pytest.mark.skip_engine("stanza_en")
-def test_spacy_stanza_from_config():
-    config = {
-        "nlp": {
-            "tokenizer": {
-                "@tokenizers": "PipelineAsTokenizer.v1",
-                "lang": "en",
-            }
-        }
-    }
-    nlp = English.from_config(config)
-    assert nlp.Defaults == EnglishDefaults
-    assert type(nlp.tokenizer) == StanzaTokenizer
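The removed test_spacy_stanza_from_config covered the declarative path: English.from_config builds the tokenizer by looking up "PipelineAsTokenizer.v1" in spaCy's @tokenizers registry. For reference, such a registry entry is created roughly as sketched below, patterned after spacy_stanza's own registration; the factory's parameter list here is an assumption, and StanzaTokenizer is imported the way the removed test imported it:

    # Illustrative sketch of a @tokenizers registry entry; the real factory in
    # stanza_nlp_engine may take more parameters (model dir, processors, etc.).
    import stanza
    from spacy.util import registry
    from presidio_analyzer.nlp_engine.stanza_nlp_engine import StanzaTokenizer

    @registry.tokenizers("PipelineAsTokenizer.v1")
    def create_tokenizer(lang: str = ""):
        def tokenizer_factory(nlp):
            # Build the stanza pipeline and wrap it so spaCy can call it as a
            # tokenizer that returns fully annotated Docs.
            snlp = stanza.Pipeline(lang=lang)
            return StanzaTokenizer(snlp, nlp.vocab)
        return tokenizer_factory

Note that with this change the config-based construction path is no longer exercised by the suite.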