Skip to content
This repository was archived by the owner on Jul 4, 2023. It is now read-only.

Commit 133a54c

Browse files
authored
Merge pull request #56 from PetrochukM/update
Release 0.3.7 - 5 fixed issues and a new label_encoder
2 parents 2d10f0e + c501355 commit 133a54c

35 files changed

+190
-77
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[flake8]
2-
ignore = E402, E722, E731
2+
ignore = E402, E722, E731, W504
33
max-line-length = 100
44
exclude = examples/

examples/snli/train.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@
7777
best_dev_acc = -1
7878
header = ' Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss Accuracy Dev/Accuracy'
7979
dev_log_template = ' '.join(
80-
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.
81-
split(','))
80+
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'
81+
.split(','))
8282
log_template = ' '.join(
8383
'{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))
8484
makedirs(args.save_path)
@@ -108,8 +108,7 @@
108108
answer = model(premise_batch, hypothesis_batch)
109109

110110
# calculate accuracy of predictions in the current batch
111-
n_correct += (torch.max(answer,
112-
1)[1].view(label_batch.size()) == label_batch).sum()
111+
n_correct += (torch.max(answer, 1)[1].view(label_batch.size()) == label_batch).sum()
113112
n_total += premise_batch.size()[1]
114113
train_acc = 100. * n_correct / n_total
115114

@@ -150,8 +149,8 @@
150149
for dev_batch_idx, (premise_batch, hypothesis_batch,
151150
label_batch) in enumerate(dev_iterator):
152151
answer = model(premise_batch, hypothesis_batch)
153-
n_dev_correct += (torch.max(answer, 1)[1].view(
154-
label_batch.size()) == label_batch).sum()
152+
n_dev_correct += (torch.max(answer,
153+
1)[1].view(label_batch.size()) == label_batch).sum()
155154
dev_loss = criterion(answer, label_batch)
156155
dev_acc = 100. * n_dev_correct / len(dev)
157156

examples/snli/util.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,9 @@ def collate_fn(batch, train=True):
5757
""" list of tensors to a batch tensors """
5858
premise_batch, _ = pad_batch([row['premise'] for row in batch])
5959
hypothesis_batch, _ = pad_batch([row['hypothesis'] for row in batch])
60-
label_batch = [row['label'] for row in batch]
60+
label_batch = torch.stack([row['label'] for row in batch])
6161

6262
# PyTorch RNN requires batches to be transposed for speed and integration with CUDA
63-
transpose = (
64-
lambda b: torch.stack(b).t_().squeeze(0).contiguous())
63+
transpose = (lambda b: b.t_().squeeze(0).contiguous())
6564

6665
return (transpose(premise_batch), transpose(hypothesis_batch), transpose(label_batch))

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ flake8
1919
# Mocking
2020
mock
2121

22-
# # Optional NLP Utilties
22+
# Optional NLP Utilties
2323
# nltk
2424
# spacy
2525
# sacremoses
2626

27-
# # Optional CUDA Utilties
27+
# Optional CUDA Utilties
2828
# pynvrtc
2929
# cupy
3030

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def find_version(*file_paths):
3636
long_description=long_description,
3737
long_description_content_type='text/markdown',
3838
license='BSD',
39-
install_requires=['numpy', 'pandas', 'tqdm', 'ujson', 'requests'],
39+
install_requires=['numpy', 'pandas', 'tqdm', 'requests'],
4040
classifiers=[
4141
'Development Status :: 4 - Beta',
4242
'Intended Audience :: Developers',

tests/datasets/test_simple_qa.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
import os
22
import shutil
3-
import pytest
43

54
import mock
5+
import pytest
66

77
from torchnlp.datasets import simple_qa_dataset
88
from tests.datasets.utils import urlretrieve_side_effect
99

1010
directory = 'tests/_test_data/'
1111

1212

13-
@pytest.mark.skip(reason="Simple Questions dataset url returns 404.")
13+
@pytest.mark.skip(reason="Simple Questions dataset url sometimes returns 404.")
1414
@mock.patch("urllib.request.urlretrieve")
1515
def test_simple_qa_dataset_row(mock_urlretrieve):
1616
mock_urlretrieve.side_effect = urlretrieve_side_effect

tests/datasets/test_smt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_smt_dataset_row(mock_urlretrieve):
3535
" splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven" +
3636
" Segal .",
3737
'label':
38-
'positive'
38+
'very positive'
3939
}
4040

4141
# Clean up

tests/nn/test_weight_drop.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def test_weight_drop_lstm():
2727
run2 = [x.sum() for x in wd_lstm(input_)[0].data]
2828

2929
# First time step, not influenced by hidden to hidden weights, should be equal
30-
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
30+
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
3131
# Second step should not
3232
assert run1[1] != run2[1]
3333

@@ -40,7 +40,7 @@ def test_weight_drop_gru():
4040
run2 = [x.sum() for x in wd_lstm(input_)[0].data]
4141

4242
# First time step, not influenced by hidden to hidden weights, should be equal
43-
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
43+
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
4444
# Second step should not
4545
assert run1[1] != run2[1]
4646

@@ -53,7 +53,7 @@ def test_weight_drop():
5353
run2 = [x.sum() for x in wd_lstm(input_)[0].data]
5454

5555
# First time step, not influenced by hidden to hidden weights, should be equal
56-
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
56+
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
5757
# Second step should not
5858
assert run1[1] != run2[1]
5959

@@ -66,6 +66,6 @@ def test_weight_drop_zero():
6666
run2 = [x.sum() for x in wd_lstm(input_)[0].data]
6767

6868
# First time step, not influenced by hidden to hidden weights, should be equal
69-
assert pytest.approx(run1[0]) == pytest.approx(run2[0])
69+
assert pytest.approx(run1[0].item()) == pytest.approx(run2[0].item())
7070
# Second step should not
71-
assert pytest.approx(run1[1]) == pytest.approx(run2[1])
71+
assert pytest.approx(run1[1].item()) == pytest.approx(run2[1].item())

tests/test_label_encoder.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import pickle
2+
3+
import pytest
4+
5+
from torchnlp.label_encoder import LabelEncoder
6+
from torchnlp.label_encoder import UNKNOWN_TOKEN
7+
8+
9+
@pytest.fixture
10+
def encoder():
11+
sample = ['people/deceased_person/place_of_death', 'symbols/name_source/namesakes']
12+
return LabelEncoder(sample)
13+
14+
15+
def test_label_encoder_vocab(encoder):
16+
assert len(encoder.vocab) == 3
17+
assert len(encoder.vocab) == encoder.vocab_size
18+
19+
20+
def test_label_encoder_scalar(encoder):
21+
input_ = 'symbols/namesake/named_after'
22+
output = encoder.encode(input_)[0]
23+
assert encoder.decode(output) == UNKNOWN_TOKEN
24+
25+
26+
def test_label_encoder_unknown(encoder):
27+
input_ = 'symbols/namesake/named_after'
28+
output = encoder.encode(input_)
29+
assert len(output) == 1
30+
assert encoder.decode(output) == UNKNOWN_TOKEN
31+
32+
33+
def test_label_encoder_known():
34+
input_ = 'symbols/namesake/named_after'
35+
sample = ['people/deceased_person/place_of_death', 'symbols/name_source/namesakes']
36+
sample.append(input_)
37+
encoder = LabelEncoder(sample)
38+
output = encoder.encode(input_)
39+
assert len(output) == 1
40+
assert encoder.decode(output) == input_
41+
42+
43+
def test_is_pickleable(encoder):
44+
pickle.dumps(encoder)

tests/text_encoders/test_subword_tokenizer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,10 @@ def test_encode_decode(self):
8989

9090
original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
9191

92-
encoder = SubwordTextTokenizer.build_to_target_size_from_corpus(
93-
[corpus, original], target_size=100, min_val=2, max_val=10)
92+
encoder = SubwordTextTokenizer.build_to_target_size_from_corpus([corpus, original],
93+
target_size=100,
94+
min_val=2,
95+
max_val=10)
9496

9597
# Encoding should be reversible.
9698
encoded = encoder.encode(original)

0 commit comments

Comments (0)