Skip to content

Commit 9cb828f

Browse files
committed
Implement vocabulary expansion
1 parent 0b528c1 commit 9cb828f

File tree

3 files changed

+15
-4
lines changed

3 files changed

+15
-4
lines changed

anago/preprocess.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ def dense_to_one_hot(labels_dense, num_classes, nlevels=1):
237237
raise ValueError('nlevels can take 1 or 2, not take {}.'.format(nlevels))
238238

239239

240-
def prepare_preprocessor(X, y, use_char=True, vocab_init=None):
    """Create a WordPreprocessor fitted on the given data.

    Args:
        X: training sequences (e.g. lists of word tokens).
        y: corresponding label sequences.
        use_char: kept for interface compatibility; not consumed here.
        vocab_init: optional pre-built vocabulary handed to the
            preprocessor so externally known words are kept (vocabulary
            expansion).

    Returns:
        The fitted WordPreprocessor instance.
    """
    preprocessor = WordPreprocessor(vocab_init=vocab_init)
    preprocessor.fit(X, y)
    return preprocessor

anago/wrapper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ def __init__(self, char_emb_size=25, word_emb_size=100, char_lstm_units=25,
3434
self.log_dir = log_dir
3535
self.embeddings = embeddings
3636

37-
def train(self, x_train, y_train, x_valid=None, y_valid=None):
38-
self.p = prepare_preprocessor(x_train, y_train)
37+
def train(self, x_train, y_train, x_valid=None, y_valid=None, vocab_init=None):
38+
self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
3939
embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
4040
self.model_config.word_embedding_size)
4141
self.model_config.vocab_size = len(self.p.vocab_word)

tests/wrapper_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import unittest
33
from pprint import pprint
44

5+
import numpy as np
6+
57
import anago
68
from anago.reader import load_data_and_labels, load_glove
79

@@ -80,3 +82,12 @@ def test_load(self):
8082

8183
model = anago.Sequence.load(self.dir_path)
8284
model.eval(self.x_test, self.y_test)
85+
86+
def test_train_vocab_init(self):
87+
vocab = set()
88+
for words in np.r_[self.x_train, self.x_valid, self.x_test]:
89+
for word in words:
90+
vocab.add(word)
91+
model = anago.Sequence(max_epoch=15, embeddings=self.embeddings, log_dir='logs')
92+
model.train(self.x_train, self.y_train, self.x_test, self.y_test, vocab_init=vocab)
93+
model.save(dir_path=self.dir_path)

0 commit comments

Comments (0)