Merged
encexp/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -17,4 +17,4 @@
if not '-m' in sys.argv:
    from encexp.text_repr import EncExpT, SeqTM, TextModel

__version__ = "0.1.6"
__version__ = "0.1.7"
encexp/tests/test_text_repr.py (312 changes: 6 additions & 306 deletions)
@@ -235,309 +235,9 @@ def test_TextModel_diac():
assert len(lst) > 3



# def test_EncExp_filename():
# """Test EncExp"""
# if not isfile('encexp-es-mx.json.gz'):
# samples()
# data = compute_b4msa_vocabulary('es-mx-sample.json')
# voc = compute_seqtm_vocabulary(SeqTM, data,
# 'es-mx-sample.json',
# voc_size_exponent=10)
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz')
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# assert enc.weights.dtype == np.float32
# assert len(enc.names) == 12
# os.unlink('encexp-es-mx.json.gz')


# def test_EncExp():
# """Test EncExp"""
# enc = EncExp(precision=np.float16)
# assert enc.weights.dtype == np.float16
# assert len(enc.names) == 8192


# def test_EncExp_encode():
# """Test EncExp encode"""

# dense = EncExp(precision=np.float16)
# assert dense.encode('buenos días').shape[1] == 2


# def test_EncExp_transform():
# """Test EncExp transform"""

# encexp = EncExp()
# X = encexp.transform(['buenos dias'])
# assert X.shape[0] == 1
# assert X.shape[1] == 8192
# assert X.dtype == np.float32


# def test_EncExp_prefix_suffix():
# """Test EncExp prefix/suffix"""

# encexp = EncExp(lang='es',
# precision=np.float16,
# prefix_suffix=True)
# for k in encexp.bow.names:
# if k[:2] != 'q:':
# continue
# if len(k) >= 6:
# continue
# assert k[3] == '~' or k[-1] == '~'


# def test_EncExp_fit():
# """Test EncExp fit"""
# from sklearn.svm import LinearSVC
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# enc = EncExp(lang='es',
# prefix_suffix=True,
# precision=np.float16).fit(mx + ar, y)
# assert isinstance(enc.estimator, LinearSVC)
# hy = enc.predict(ar)
# assert hy.shape[0] == len(ar)
# df = enc.decision_function(ar)
# assert df.shape[0] == len(ar)
# assert df.dtype == np.float64


# def test_EncExp_fit_sgd():
# """Test EncExp fit"""
# from sklearn.linear_model import SGDClassifier
# from itertools import repeat
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# D = mx + ar
# # while len(D) < 2**17:
# for i in range(5):
# D.extend(D)
# y.extend(y)
# D.append(D[0])
# y.append(y[0])
# enc = EncExp(lang='es').fit(D, y)
# assert isinstance(enc.estimator, SGDClassifier)
# hy = enc.predict(ar)
# assert hy.shape[0] == len(ar)
# df = enc.decision_function(ar)
# assert df.shape[0] == len(ar)
# assert df.dtype == np.float64


# def test_EncExp_train_predict_decision_function():
# """Test EncExp train_predict_decision_function"""
# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# samples(filename='es-es-sample.json.zip')
# es = list(tweet_iterator('es-es-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)
# enc = EncExp(lang='es',
# prefix_suffix=True,
# precision=np.float16)
# hy = enc.train_predict_decision_function(mx + ar, y)
# assert hy.ndim == 2 and hy.shape[0] == len(y) and hy.shape[1] == 1
# y += ['es'] * len(es)
# hy = enc.train_predict_decision_function(mx + ar + es, y)
# assert hy.shape[1] == 3 and hy.shape[0] == len(y)


# def test_EncExp_clone():
# """Test EncExp clone"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16)
# enc2 = clone(enc)
# assert isinstance(enc2, EncExp)
# assert np.all(enc2.weights == enc.weights)


# def test_EncExp_merge_IDF():
# """Test EncExp without keyword's weight"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=False)
# enc.fill(inplace=True)

# for k, v in enc.bow.token2id.items():
# assert enc.weights[v, v] == 0
# enc2 = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=True,
# force_token=False)
# enc2.fill(inplace=True)
# _ = (enc.weights * enc.bow.weights).astype(enc.precision)
# assert_almost_equal(_, enc2.weights, decimal=5)


# def test_EncExp_fill():
# """Test EncExp fill weights"""
# from encexp.download import download_seqtm

# voc = download_seqtm(lang='es')
# samples()
# if not isfile('encexp-es-mx.json.gz'):
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
# min_pos=64)
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# iden = {v:k for k, v in enumerate(enc.bow.names)}
# comp = [x for x in enc.bow.names if x not in enc.names]
# key = enc.names[0]
# enc.weights
# w = enc.fill()
# assert np.any(w[iden[key]] != 0)
# assert_almost_equal(w[iden[comp[0]]], 0)
# os.unlink('encexp-es-mx.json.gz')
# assert np.all(enc.names == enc.bow.names)


# def test_EncExp_iadd():
# """Test EncExp iadd"""

# from encexp.download import download_seqtm

# voc = download_seqtm(lang='es')
# samples()
# if not isfile('encexp-es-mx.json.gz'):
# build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
# min_pos=64)
# enc = EncExp(EncExp_filename='encexp-es-mx.json.gz')
# w = enc.weights
# enc += enc
# assert_almost_equal(w, enc.weights, decimal=4)
# os.unlink('encexp-es-mx.json.gz')
# enc2 = EncExp(lang='es', voc_source='noGeo')
# enc2 += enc
# enc2 = EncExp(lang='es', voc_source='noGeo')
# r = enc2 + enc2
# r.weights[:, :] = 0
# assert enc2.weights[0, 0] != 0


# def test_EncExp_force_tokens():
# """Test force tokens"""

# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16,
# force_token=False)
# w = enc.weights
# _max = w.max(axis=1)
# rows = np.arange(len(enc.names))
# cols = np.array([enc.bow.token2id[x] for x in enc.names])
# assert_almost_equal(w[rows, cols], 0)
# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16,
# force_token=True)
# w[rows, cols] = _max
# assert_almost_equal(enc.weights, w)
# enc = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=False)
# assert enc.weights[0, 0] == 0
# enc.force_tokens_weights(IDF=True)
# enc2 = EncExp(lang='es', prefix_suffix=True,
# precision=np.float16, merge_IDF=False,
# force_token=True)
# assert enc.weights[0, 0] != enc2.weights[0, 0]
# assert_almost_equal(enc.weights[0, 1:], enc2.weights[0, 1:])


# def test_EncExp_enc_training_size():
# """Test training size of the embeddings"""

# enc = EncExp(lang='es')
# assert isinstance(enc.enc_training_size, dict)
# for k in enc.enc_training_size:
# assert k in enc.names


# def test_EncExp_distance():
# """Test distance to hyperplane"""

# txt = 'buenos días'
# enc = EncExp(lang='es', transform_distance=True)
# assert enc.weights_norm.shape[0] == enc.weights.shape[0]
# X = enc.transform([txt])
# X2 = EncExp(lang='es',
# transform_distance=False).transform([txt])
# assert np.fabs(X - X2).sum() != 0


# def test_EncExp_unit_vector():
# """Test distance to hyperplane"""

# txt = 'buenos días'
# enc = EncExp(lang='es', unit_vector=False)
# X = enc.transform([txt])
# assert np.linalg.norm(X) != 1
# enc = EncExp(lang='es')
# X = enc.transform([txt])
# assert_almost_equal(np.linalg.norm(X), 1)


# def test_EncExp_build_tailored():
# """Test the development of tailored models"""

# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)

# enc = EncExp(lang='es',
# tailored=True)
# w = enc.weights
# enc.build_tailored(mx + ar, load=True)
# assert isfile(enc.tailored)
# assert hasattr(enc, '_tailored_built')
# enc = EncExp(lang='es',
# tailored=enc.tailored).fit(mx + ar, y)
# assert np.fabs(w - enc.weights).sum() != 0
# enc2 = clone(enc)
# assert hasattr(enc2, '_tailored_built')
# assert hasattr(enc2, '_estimator')
# # os.unlink(enc.tailored)


# def test_pipeline_encexp():
# """Test Pipeline in EncExpT"""
# from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import StratifiedShuffleSplit

# samples()
# mx = list(tweet_iterator('es-mx-sample.json'))
# samples(filename='es-ar-sample.json.zip')
# ar = list(tweet_iterator('es-ar-sample.json'))
# y = ['mx'] * len(mx)
# y += ['ar'] * len(ar)

# pipe = Pipeline([('encexp', EncExpT(lang='es')),
# ('cl', LinearSVC(class_weight='balanced'))])
# params = {'cl__C': [0.01, 0.1, 1, 10],
# 'encexp__voc_source': ['mix', 'noGeo']}
# sss = StratifiedShuffleSplit(random_state=0,
# n_splits=1,
# test_size=0.3)

# grid = GridSearchCV(pipe,
# param_grid=params,
# cv=sss,
# n_jobs=1,
# scoring='f1_macro').fit(mx + ar, y)
# assert grid.best_score_ > 0.7
def test_EncExpT_transform_dtype():
    """Test EncExpT transform type"""
    enc = EncExpT(lang='es',
                  token_max_filter=2**13)
    X = enc.transform(['buenos dias'])
    assert X.dtype == enc.precision
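
The added test checks that the dtype of `EncExpT.transform` follows the model's `precision` attribute. A minimal usage sketch along the same lines (the sample text is illustrative; `lang`, `token_max_filter`, and the `precision` attribute are taken from the test above):

```python
# Usage sketch mirroring test_EncExpT_transform_dtype; the sample text is
# illustrative, the parameters come from the test above.
from encexp.text_repr import EncExpT

enc = EncExpT(lang='es', token_max_filter=2**13)
X = enc.transform(['buenos dias'])
print(X.shape[0], X.dtype)  # one row per input text; dtype matches enc.precision
```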