Merge pull request #21 from INGEOTEC/develop

mgraffg · web-flow · commit b7449bd91822 · 2024-12-18T11:56:44.000-06:00
EncExp
diff --git a/encexp/__init__.py b/encexp/__init__.py
@@ -15,6 +15,6 @@
 import sys
 
 if not '-m' in sys.argv:
-    from encexp.text_repr import EncExp
+    from encexp.text_repr import EncExp, EncExpT, SeqTM, TM
 
-__version__ = "0.0.18"
+__version__ = "0.0.19"
diff --git a/encexp/download.py b/encexp/download.py
@@ -22,6 +22,7 @@
 
 def download_seqtm(lang, voc_size_exponent: int=13,
                    output=None, voc_source='noGeo',
+                   prefix='seqtm',
                    prefix_suffix: bool=True):
     """Download SeqTM vocabulary"""
     if not isdir(MODELS):
@@ -32,7 +33,7 @@ def download_seqtm(lang, voc_size_exponent: int=13,
     for flag in [voc_source]:
         if flag is not None:
             flags.append(flag)
-    voc_fname = f'seqtm_{"_".join(flags)}_{lang}_{voc_size_exponent}.json.gz'
+    voc_fname = f'{prefix}_{"_".join(flags)}_{lang}_{voc_size_exponent}.json.gz'
     if output is None:
         output = join(MODELS, voc_fname)
     if isfile(output):
diff --git a/encexp/tests/test_text_repr.py b/encexp/tests/test_text_repr.py
@@ -19,10 +19,16 @@
 from encexp.tests.test_utils import samples
 from encexp.utils import compute_b4msa_vocabulary, compute_seqtm_vocabulary
 from encexp.build_encexp import build_encexp
-from encexp.text_repr import SeqTM, EncExp
+from encexp.text_repr import SeqTM, EncExp, TM, EncExpT
 from sklearn.base import clone
 
 
+def test_tm():
+    """Test TM"""
+    tm = TM(voc_source='mix')
+    _ = tm['buenos dias mxeico']
+    assert len(_) == 13
+
 def test_seqtm():
     """Test SeqTM"""
     
@@ -377,9 +383,73 @@ def test_EncExp_build_tailored():
     enc = EncExp(lang='es',
                  tailored=True)
     w = enc.weights
-    enc.build_tailored(mx + ar)    
+    enc.build_tailored(mx + ar, load=True)    
     assert isfile(enc.tailored)
+    assert hasattr(enc, '_tailored_built')
     enc = EncExp(lang='es',
                  tailored=enc.tailored).fit(mx + ar, y)
     assert np.fabs(w - enc.weights).sum() != 0
+    enc2 = clone(enc)
+    assert hasattr(enc2, '_tailored_built')
+    assert hasattr(enc2, '_estimator')
     # os.unlink(enc.tailored)
+
+def test_pipeline_tm():
+    """Test Pipeline"""
+    samples()
+    mx = list(tweet_iterator('es-mx-sample.json'))
+    samples(filename='es-ar-sample.json.zip')
+    ar = list(tweet_iterator('es-ar-sample.json'))
+    y = ['mx'] * len(mx)
+    y += ['ar'] * len(ar)
+
+    from sklearn.pipeline import Pipeline
+    from sklearn.svm import LinearSVC
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.model_selection import StratifiedShuffleSplit
+
+    pipe = Pipeline([('bow', 'passthrough'),
+                    ('cl', LinearSVC(class_weight='balanced'))])
+    params = {'cl__C': [0.01, 0.1, 1, 10],
+              'bow': [SeqTM(lang='es', voc_source='mix'),
+                      TM(lang='es', voc_source='mix')]}
+    sss = StratifiedShuffleSplit(random_state=0,
+                                 n_splits=1,
+                                 test_size=0.3)
+
+    grid = GridSearchCV(pipe,
+                        param_grid=params,
+                        cv=sss,
+                        n_jobs=-1,
+                        scoring='f1_macro').fit(mx + ar, y)
+    assert grid.best_score_ > 0.7
+
+
+def test_pipeline_encexp():
+    """Test Pipeline in EncExpT"""
+    from sklearn.pipeline import Pipeline
+    from sklearn.svm import LinearSVC
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.model_selection import StratifiedShuffleSplit
+
+    samples()
+    mx = list(tweet_iterator('es-mx-sample.json'))
+    samples(filename='es-ar-sample.json.zip')
+    ar = list(tweet_iterator('es-ar-sample.json'))
+    y = ['mx'] * len(mx)
+    y += ['ar'] * len(ar)
+
+    pipe = Pipeline([('encexp', EncExpT(lang='es')),
+                     ('cl', LinearSVC(class_weight='balanced'))])
+    params = {'cl__C': [0.01, 0.1, 1, 10],
+              'encexp__voc_source': ['mix', 'noGeo']}
+    sss = StratifiedShuffleSplit(random_state=0,
+                                n_splits=1,
+                                test_size=0.3)
+
+    grid = GridSearchCV(pipe,
+                        param_grid=params,
+                        cv=sss,
+                        n_jobs=1,
+                        scoring='f1_macro').fit(mx + ar, y)
+    assert grid.best_score_ > 0.7
diff --git a/encexp/text_repr.py b/encexp/text_repr.py