Merged
encexp/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -17,4 +17,4 @@
if not '-m' in sys.argv:
    from encexp.text_repr import EncExpT, SeqTM, TextModel

__version__ = "0.1.6"
__version__ = "0.1.7"
encexp/tests/test_text_repr.py (312 changes: 6 additions & 306 deletions)
@@ -235,309 +235,9 @@ def test_TextModel_diac():
assert len(lst) > 3



# def test_EncExp_filename():
# """Test EncExp"""
# if not isfile('encexp-es-mx.json.gz'):
# samples()
# data = compute_b4msa_vocabulary('es-mx-sample.json')
# voc = compute_seqtm_vocabulary(SeqTM, data,
# 'es-mx-sample.json',
# voc_size_exponent=10)
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz')
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# assert enc.weights.dtype == np.float32
# assert len(enc.names) == 12
# os.unlink('encexp-es-mx.json.gz')


# def test_EncExp():
# """Test EncExp"""
# enc = EncExp(precision=np.float16)
# assert enc.weights.dtype == np.float16
# assert len(enc.names) == 8192


# def test_EncExp_encode():
# """Test EncExp encode"""

# dense = EncExp(precision=np.float16)
# assert dense.encode('buenos días').shape[1] == 2


# def test_EncExp_transform():
# """Test EncExp transform"""

# encexp = EncExp()
# X = encexp.transform(['buenos dias'])
# assert X.shape[0] == 1
# assert X.shape[1] == 8192
# assert X.dtype == np.float32


# def test_EncExp_prefix_suffix():
# """Test EncExp prefix/suffix"""

# encexp = EncExp(lang='es',
# precision=np.float16,
# prefix_suffix=True)
# for k in encexp.bow.names:
# if k[:2] != 'q:':
# continue
# if len(k) >= 6:
# continue
# assert k[3] == '~' or k[-1] == '~'


# def test_EncExp_fit():
# """Test EncExp fit"""
# from sklearn.svm import LinearSVC
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# enc = EncExp(lang='es',
# prefix_suffix=True,
# precision=np.float16).fit(mx + ar, y)
# assert isinstance(enc.estimator, LinearSVC)
# hy = enc.predict(ar)
# assert hy.shape[0] == len(ar)
# df = enc.decision_function(ar)
# assert df.shape[0] == len(ar)
# assert df.dtype == np.float64


# def test_EncExp_fit_sgd():
# """Test EncExp fit"""
# from sklearn.linear_model import SGDClassifier
# from itertools import repeat
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# D = mx + ar
# # while len(D) < 2**17:
# for i in range(5):
# D.extend(D)
# y.extend(y)
# D.append(D[0])
# y.append(y[0])
# enc = EncExp(lang='es').fit(D, y)
# assert isinstance(enc.estimator, SGDClassifier)
# hy = enc.predict(ar)
# assert hy.shape[0] == len(ar)
# df = enc.decision_function(ar)
# assert df.shape[0] == len(ar)
# assert df.dtype == np.float64


# def test_EncExp_train_predict_decision_function():
# """Test EncExp train_predict_decision_function"""
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# samples(filename='es-es-sample.json.zip')
# es = list(tweet_iterator('es-es-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# enc = EncExp(lang='es',
# prefix_suffix=True,
# precision=np.float16)
# hy = enc.train_predict_decision_function(mx + ar, y)
# assert hy.ndim == 2 and hy.shape[0] == len(y) and hy.shape[1] == 1
# y += ['es'] * len(es)
# hy = enc.train_predict_decision_function(mx + ar + es, y)
# assert hy.shape[1] == 3 and hy.shape[0] == len(y)


# def test_EncExp_clone():
# """Test EncExp clone"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16)
# enc2 = clone(enc)
# assert isinstance(enc2, EncExp)
# assert np.all(enc2.weights == enc.weights)


# def test_EncExp_merge_IDF():
# """Test EncExp without keyword's weight"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=False)
# enc.fill(inplace=True)

# for k, v in enc.bow.token2id.items():
# assert enc.weights[v, v] == 0
# enc2 = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=True,
# force_token=False)
# enc2.fill(inplace=True)
# _ = (enc.weights * enc.bow.weights).astype(enc.precision)
# assert_almost_equal(_, enc2.weights, decimal=5)


# def test_EncExp_fill():
# """Test EncExp fill weights"""
# from encexp.download import download_seqtm

# voc = download_seqtm(lang='es')
# samples()
# if not isfile('encexp-es-mx.json.gz'):
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
# min_pos=64)
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# iden = {v:k for k, v in enumerate(enc.bow.names)}
# comp = [x for x in enc.bow.names if x not in enc.names]
# key = enc.names[0]
# enc.weights
# w = enc.fill()
# assert np.any(w[iden[key]] != 0)
# assert_almost_equal(w[iden[comp[0]]], 0)
# os.unlink('encexp-es-mx.json.gz')
# assert np.all(enc.names == enc.bow.names)


# def test_EncExp_iadd():
# """Test EncExp iadd"""

# from encexp.download import download_seqtm

# voc = download_seqtm(lang='es')
# samples()
# if not isfile('encexp-es-mx.json.gz'):
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
# min_pos=64)
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# w = enc.weights
# enc += enc
# assert_almost_equal(w, enc.weights, decimal=4)
# os.unlink('encexp-es-mx.json.gz')
# enc2 = EncExp(lang='es', voc_source='noGeo')
# enc2 += enc
# enc2 = EncExp(lang='es', voc_source='noGeo')
# r = enc2 + enc2
# r.weights[:, :] = 0
# assert enc2.weights[0, 0] != 0


# def test_EncExp_force_tokens():
# """Test force tokens"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16,
# force_token=False)
# w = enc.weights
# _max = w.max(axis=1)
# rows = np.arange(len(enc.names))
# cols = np.array([enc.bow.token2id[x] for x in enc.names])
# assert_almost_equal(w[rows, cols], 0)
# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16,
# force_token=True)
# w[rows, cols] = _max
# assert_almost_equal(enc.weights, w)
# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=False)
# assert enc.weights[0, 0] == 0
# enc.force_tokens_weights(IDF=True)
# enc2 = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=True)
# assert enc.weights[0, 0] != enc2.weights[0, 0]
# assert_almost_equal(enc.weights[0, 1:], enc2.weights[0, 1:])


# def test_EncExp_enc_training_size():
# """Test training size of the embeddings"""

# enc = EncExp(lang='es')
# assert isinstance(enc.enc_training_size, dict)
# for k in enc.enc_training_size:
# assert k in enc.names


# def test_EncExp_distance():
# """Test distance to hyperplane"""

# txt = 'buenos días'
# enc = EncExp(lang='es', transform_distance=True)
# assert enc.weights_norm.shape[0] == enc.weights.shape[0]
# X = enc.transform([txt])
# X2 = EncExp(lang='es',
# transform_distance=False).transform([txt])
# assert np.fabs(X - X2).sum() != 0


# def test_EncExp_unit_vector():
# """Test distance to hyperplane"""

# txt = 'buenos días'
# enc = EncExp(lang='es', unit_vector=False)
# X = enc.transform([txt])
# assert np.linalg.norm(X) != 1
# enc = EncExp(lang='es')
# X = enc.transform([txt])
# assert_almost_equal(np.linalg.norm(X), 1)


# def test_EncExp_build_tailored():
# """Test the development of tailored models"""

# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)

# enc = EncExp(lang='es',
# tailored=True)
# w = enc.weights
# enc.build_tailored(mx + ar, load=True)
# assert isfile(enc.tailored)
# assert hasattr(enc, '_tailored_built')
# enc = EncExp(lang='es',
# tailored=enc.tailored).fit(mx + ar, y)
# assert np.fabs(w - enc.weights).sum() != 0
# enc2 = clone(enc)
# assert hasattr(enc2, '_tailored_built')
# assert hasattr(enc2, '_estimator')
# # os.unlink(enc.tailored)


# def test_pipeline_encexp():
# """Test Pipeline in EncExpT"""
# from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedShuffleSplit

# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)

# pipe = Pipeline([('encexp', EncExpT(lang='es')),
# ('cl', LinearSVC(class_weight='balanced'))])
# params = {'cl__C': [0.01, 0.1, 1, 10],
# 'encexp__voc_source': ['mix', 'noGeo']}
# sss = StratifiedShuffleSplit(random_state=0,
# n_splits=1,
# test_size=0.3)

# grid = GridSearchCV(pipe,
# param_grid=params,
# cv=sss,
# n_jobs=1,
# scoring='f1_macro').fit(mx + ar, y)
# assert grid.best_score_ > 0.7
def test_EncExpT_transform_dtype():
    """Test EncExpT transform type"""
    enc = EncExpT(lang='es',
                  token_max_filter=2**13)
    X = enc.transform(['buenos dias'])
    assert X.dtype == enc.precision
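
The added test checks that the dtype of `EncExpT.transform` follows the model's `precision` attribute. A minimal usage sketch along the same lines (the sample text is illustrative; `lang`, `token_max_filter`, and the `precision` attribute are taken from the test above):

```python
# Usage sketch mirroring test_EncExpT_transform_dtype; the sample text is
# illustrative, the parameters come from the test above.
from encexp.text_repr import EncExpT

enc = EncExpT(lang='es', token_max_filter=2**13)
X = enc.transform(['buenos dias'])
print(X.shape[0], X.dtype)  # one row per input text; dtype matches enc.precision
```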