@@ -58,8 +58,8 @@ To train new models, the training data needs to be prepared. This process is aut
 | nickname = "Language" | (str) | The nickname for saving / loading models
 | divide_data = True | (boolean) | If True, crawl for dataset; if False, just load it
 | test_samples = 20 | (int) | The number of files for each class to use for testing
-| threshold = 100 | (int) | Number of files required before language or country is included in the model
-| samples_per_epoch = 5 | (int) | Number of samples to use per training epoch for language-domain or language-country pairs
+| threshold = 100 | (int) | Number of files required before a language/country is included in the model
+| samples_per_epoch = 5 | (int) | Number of samples to use per training epoch
 | language = "" | (str) | For DID, specifies the language of the current model
 | lid_sample_size = 200 | (int) | For LID, the number of characters to allow per sample
 | did_sample_size = 1 | (int) | For DID, the number of 100-word samples to combine
@@ -69,15 +69,17 @@ To train new models, the training data needs to be prepared. This process is aut
 | class_constraints = [ ] | (list of strs) | Option to constrain which classes are included
 | merge_dict = {} | (dict) | Dict mapping original class names to new names ({original: new})
 
-id.train(model_type, lid_features, lid_ngrams, did_grammar, c2xg_workers, mlp_sizes, cross_val, dropout, activation, optimizer )
+id.train()
 
-model_type = "MLP" #(str): MLP or SVM
-lid_features = 524288 #(int): Number of character n-gram features to allow, hashing only
-lid_ngrams = (1,3) #(tuple of ints): Range of n-grams to hash
-did_grammar = ".Grammar.p" #(str): Name of C2xG grammar to use for annotation; allows comparison of different grammars
-c2xg_workers = 1 #(int): For DID, number of workers for c2xg enrichments
-mlp_sizes = (300, 300, 300) #(tuple of ints): Size and number of layers; e.g., 3 layers at 300 neurons each
-cross_val = False #(boolean): Whether to use cross-validation rather than a held-out test set
-dropout = 0.25 #(float): The amount of dropout to apply to each layer
-activation = "relu" #(str): The type of activation; just passes name to Keras
-optimizer = "sgd" #(str): The type of optimization algorithm; just passes name to Keras
+| Argument | Type | Description |
+| ------------------ | ------------ | -------------------------- |
+| model_type = "MLP" | (str) | MLP or SVM |
+| lid_features = 524288 | (int) | Number of character n-gram features to allow, hashing only |
+| lid_ngrams = (1,3) | (tuple of ints) | Range of n-grams to hash |
+| did_grammar = ".Grammar.p" | (str) | Name of C2xG grammar to use for annotation; allows comparison of different grammars |
+| c2xg_workers = 1 | (int) | For DID, number of workers for c2xg enrichments |
+| mlp_sizes = (300, 300, 300) | (tuple of ints) | Size and number of layers; e.g., 3 layers at 300 neurons each |
+| cross_val = False | (boolean) | Whether to use cross-validation rather than a held-out test set |
+| dropout = 0.25 | (float) | The amount of dropout to apply to each layer |
+| activation = "relu" | (str) | The type of activation; just passes name to Keras |
+| optimizer = "sgd" | (str) | The type of optimization algorithm; just passes name to Keras |
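As a worked illustration of the preparation parameters in the first table, here is a minimal sketch. The `idNet_Train` class name and import path are assumptions made for this example only, as is the idea that these values are passed as constructor keyword arguments; the parameter names and defaults themselves come from the table.

```python
# Minimal sketch only: idNet_Train and its import path are assumed names,
# and passing the preparation parameters as constructor keyword arguments
# is an assumption for illustration. Names and defaults follow the table.
from idNet import idNet_Train  # assumed import path

id = idNet_Train(
    nickname = "Language",    # nickname for saving / loading models
    divide_data = True,       # True: crawl for the dataset; False: just load it
    test_samples = 20,        # files per class held out for testing
    threshold = 100,          # files required before a language/country is included
    samples_per_epoch = 5,    # samples to use per training epoch
    language = "",            # for DID, the language of the current model
    lid_sample_size = 200,    # for LID, characters to allow per sample
    did_sample_size = 1,      # for DID, number of 100-word samples to combine
    class_constraints = [],   # optionally constrain which classes are included
    merge_dict = {},          # {original name: new name} pairs for merging classes
)
```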
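Continuing the sketch, `id.train()` can be called with no arguments to accept every default in the second table, or with keyword overrides; the keyword-argument call style is implied by the documented defaults, and the argument names are taken directly from the table.

```python
# Accept all defaults from the argument table above:
id.train()

# Or override a few of them; "MLP" vs. "SVM" selects the classifier,
# and activation/optimizer names are passed straight through to Keras.
id.train(
    model_type = "MLP",
    mlp_sizes = (300, 300, 300),  # three hidden layers of 300 neurons each
    cross_val = False,            # held-out test set rather than cross-validation
    dropout = 0.25,               # dropout applied to each layer
    activation = "relu",
    optimizer = "sgd",
)
```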