
Commit ef78ca9

fix mongodb problem: only one worker can be spawned by a single n3fit run; the server is now started only if a connection to the database fails; the --restart option is removed and restarting now happens automatically
1 parent a470562 commit ef78ca9
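
For context, a minimal sketch of the connection check this commit describes (a hypothetical stand-in for `MongodRunner.is_up`, not the actual implementation; it assumes pymongo is installed): the server is launched only when a ping to the database fails.

from pymongo import MongoClient
from pymongo.errors import PyMongoError


def is_up(host="localhost", port=27017, timeout_ms=2000):
    """Hypothetical check: return True if a mongod server is already reachable."""
    try:
        client = MongoClient(host=host, port=port, serverSelectionTimeoutMS=timeout_ms)
        # `ping` raises if no server answers within the timeout
        client.admin.command("ping")
        return True
    except PyMongoError:
        return False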

File tree

5 files changed: +212 −299 lines

n3fit/src/n3fit/hyper_optimization/hyper_scan.py

Lines changed: 66 additions & 86 deletions
@@ -15,12 +15,11 @@
 
 import copy
 import logging
-import os
 
+import hyperopt
 from hyperopt.pyll.base import scope
 import numpy as np
 
-import hyperopt
 from n3fit.backends import MetaLayer, MetaModel
 from n3fit.hyper_optimization.filetrials import FileTrials
 
@@ -101,6 +100,8 @@ def optimizer_arg_wrapper(hp_key, option_dict):
         choice = hp_uniform(hp_key, min_lr, max_lr)
     elif sampling == "log":
         choice = hp_loguniform(hp_key, min_lr, max_lr)
+    else:
+        raise ValueError(f"Sampling {sampling} not understood")
     return choice
 
 
@@ -129,58 +130,49 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
     # Tell the trainer we are doing hpyeropt
     model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys)
 
-    if hyperscanner.restart_hyperopt:
-        # For parallel hyperopt restarts, extract the database tar file
-        if hyperscanner.parallel_hyperopt:
-            tar_file_to_extract = f"{replica_path_set}/{hyperscanner.db_name}.tar.gz"
-            log.info("Restarting hyperopt run using the MongoDB database %s", tar_file_to_extract)
-            MongoFileTrials.extract_mongodb_database(tar_file_to_extract, path=os.getcwd())
-        else:
-            # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file
-            pickle_file_to_load = f"{replica_path_set}/tries.pkl"
-            log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
-            trials = FileTrials.from_pkl(pickle_file_to_load)
-
+    # Generate the trials object, as a MongoFileTrial or a simple sequential FileTrial
     if hyperscanner.parallel_hyperopt:
-        # start MongoDB database by launching `mongod`
-        hyperscanner.mongod_runner.ensure_database_dir_exists()
-        mongod = hyperscanner.mongod_runner.start()
+        # If we are running in parallel:
+        # 1) Check whether the database is already on, start it otherwise
+        if not hyperscanner.mongod_runner.is_up():
+            hyperscanner.mongod_runner.start()
 
-    # Generate the trials object
-    if hyperscanner.parallel_hyperopt:
-        # Instantiate `MongoFileTrials`
-        # Mongo database should have already been initiated at this point
+        # Instantiate `MongoFileTrials` as trials to give to the worker later
         trials = MongoFileTrials(
             replica_path_set,
-            db_host=hyperscanner.db_host,
-            db_port=hyperscanner.db_port,
-            db_name=hyperscanner.db_name,
-            num_workers=hyperscanner.num_mongo_workers,
+            hyperscanner.mongod_runner,
+            num_workers=1,  # Only one worker per n3fit job will run
             parameters=hyperscanner.as_dict(),
         )
     else:
+        # If we are not running in parallel, check whether there's a pickle to load and restart
+        # For sequential hyperopt restarts, reset the state of `FileTrials` saved in the pickle file
+        pickle_file_to_load = replica_path_set / "tries.pkl"
+        if pickle_file_to_load.exists():
+            log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
+            trials = FileTrials.from_pkl(pickle_file_to_load)
         # Instantiate `FileTrials`
         trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
 
     # Initialize seed for hyperopt
     trials.rstate = np.random.default_rng(HYPEROPT_SEED)
+    # And prepare the generic arguments to fmin
+    fmin_args = {
+        "fn": model_trainer.hyperparametrizable,
+        "space": hyperscanner.as_dict(),
+        "algo": hyperopt.tpe.suggest,
+        "max_evals": max_evals,
+        "trials": trials,
+        "rstate": trials.rstate,
+    }
 
-    # Call to hyperopt.fmin
-    fmin_args = dict(
-        fn=model_trainer.hyperparametrizable,
-        space=hyperscanner.as_dict(),
-        algo=hyperopt.tpe.suggest,
-        max_evals=max_evals,
-        trials=trials,
-        rstate=trials.rstate,
-    )
     if hyperscanner.parallel_hyperopt:
         trials.start_mongo_workers()
-        hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=trials.num_workers)
+        # TODO benchmark how the behaviour depends on max_queue_len (if it does)
+        hyperopt.fmin(**fmin_args, show_progressbar=True, max_queue_len=4)
        trials.stop_mongo_workers()
         # stop mongod command and compress database
-        hyperscanner.mongod_runner.stop(mongod)
-        trials.compress_mongodb_database()
+        hyperscanner.mongod_runner.stop()
     else:
         hyperopt.fmin(**fmin_args, show_progressbar=False, trials_save_file=trials.pkl_file)
 
@@ -212,56 +204,47 @@ class HyperScanner:
     It takes cares of known correlation between parameters by tying them together
     It also provides methods for updating the parameter dictionaries after using hyperopt
 
-    It takes as inpujt the dictionaries defining the NN/fit and the hyperparameter scan
+    It takes as input the dictionaries defining the NN/fit and the hyperparameter scan
     from the NNPDF runcard and substitutes in `parameters` samplers according to the
     `hyper_scan` dictionary.
 
+    In the sampling dict,
+
+
+    Parameters
+    ----------
+    `parameters`: dict
+        the `fitting[parameters]` dictionary of the NNPDF runcard
+    `sampling_dict`: dict
+        the `hyperscan` dictionary of the NNPDF runcard defining the search space of the scan
+    `steps`: int
+        when taking discrete steps between two parameters, number of steps to take
 
-    # Arguments:
-        - `parameters`: the `fitting[parameters]` dictionary of the NNPDF runcard
-        - `sampling_dict`: the `hyperscan` dictionary of the NNPDF runcard defining
-                           the search space of the scan
-        - `steps`: when taking discrete steps between two parameters, number of steps
-                   to take
-
-    # Parameters accepted by `sampling_dict`:
-        - `stopping`:
-            - min_epochs, max_epochs
-            - min_patience, max_patience
     """
 
-    def __init__(self, parameters, sampling_dict, steps=5):
+    def __init__(
+        self, parameters, sampling_dict, steps=5, db_host=None, db_port=None, db_path=None
+    ):
         self._original_parameters = parameters
         self.parameter_keys = parameters.keys()
         self.parameters = copy.deepcopy(parameters)
         self.steps = steps
 
-        # adding extra options for restarting
-        restart_config = sampling_dict.get("restart")
-        self.restart_hyperopt = True if restart_config else False
-
         # adding extra options for parallel execution
-        parallel_config = sampling_dict.get("parallel")
-        if parallel_config is None:
-            self.parallel_hyperopt = False
-        elif _has_pymongo:
+        self._db_path = db_path
+        self._db_host = db_host
+        self._db_port = db_port
+        self.mongod_runner = None
+        self.parallel_hyperopt = False
+
+        if db_path is not None:
+            # If we get a db_path, assume we want to run in parallel, therefore check whether we can
+            if not _has_pymongo:
+                raise ModuleNotFoundError(
+                    "Could not import pymongo modules, please install with `.[parallelhyperopt]`"
+                )
             self.parallel_hyperopt = True
-        else:
-            raise ModuleNotFoundError(
-                "Could not import pymongo modules, please install with `.[parallelhyperopt]`"
-            )
-
-        self.parallel_hyperopt = True if parallel_config else False
-
-        # setting up MondoDB options
-        if self.parallel_hyperopt:
-            # add output_path to db name to avoid conflicts
-            db_name = f'{sampling_dict.get("db_name")}-{sampling_dict.get("output_path")}'
-            self.db_host = sampling_dict.get("db_host")
-            self.db_port = sampling_dict.get("db_port")
-            self.db_name = db_name
-            self.num_mongo_workers = sampling_dict.get("num_mongo_workers")
-            self.mongod_runner = MongodRunner(self.db_name, self.db_port)
+            self.mongod_runner = MongodRunner(self._db_path, self._db_host, self._db_port)
 
         self.hyper_keys = set([])
 
@@ -323,14 +306,11 @@ def _update_param(self, key, sampler):
 
         if key not in self.parameter_keys and key != "parameters":
             raise ValueError(
-                "Trying to update a parameter not declared in the `parameters` dictionary: {0} @ HyperScanner._update_param".format(
-                    key
-                )
+                f"Trying to update a parameter not declared in the `parameters` dictionary: {key} @ HyperScanner._update_param"
             )
 
         self.hyper_keys.add(key)
-        log.info("Adding key {0} with value {1}".format(key, sampler))
-
+        log.info(f"Adding key {key} with value {sampler}")
         self.parameters[key] = sampler
 
     def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_patience=None):
@@ -376,8 +356,8 @@ def optimizer(self, optimizers):
         ]
         and will sample one from this list.
 
-        Note that the keys within the dictionary (`optimizer_name` and `learning_rate`) should be named
-        as the keys used by the compiler of the model as they are used as they come.
+        Note that the keys within the dictionary (`optimizer_name` and `learning_rate`)
+        should be named as the keys used by the compiler of the model.
         """
         # Get all accepted optimizer to check against
         all_optimizers = MetaModel.accepted_optimizers
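
As a reference for the docstring above, a hypothetical `optimizers` list; the learning-rate fields mirror the `sampling` options handled by `optimizer_arg_wrapper` earlier in the diff, but the exact field names here are illustrative:

# Illustrative only: each entry names an optimizer and, optionally,
# a learning-rate range to be sampled (cf. optimizer_arg_wrapper)
optimizers = [
    {"optimizer_name": "Adam", "learning_rate": {"sampling": "log", "min": 1e-4, "max": 1e-1}},
    {"optimizer_name": "RMSprop"},
]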
@@ -393,7 +373,7 @@
             name = optimizer[optname_key]
             optimizer_dictionary = {optname_key: name}
 
-            if name not in all_optimizers.keys():
+            if name not in all_optimizers:
                 raise NotImplementedError(
                     f"HyperScanner: Optimizer {name} not implemented in MetaModel.py"
                 )
@@ -476,8 +456,8 @@ def architecture(
         else:
             if min_units is None or max_units is None:
                 raise ValueError(
-                    "A max/min number of units must always be defined if the number of layers is to be sampled"
-                    "i.e., make sure you add the keywords 'min_units' and 'max_units' to the 'architecutre' dict"
+                    "A max/min number of units must always be defined when the number of layers"
+                    "is to be sampled, i.e., add 'min_units' and 'max_units' to 'architecture' dict"
                 )
 
         activation_key = "activation_per_layer"
@@ -497,7 +477,7 @@
         for n in n_layers:
             units = []
             for i in range(n):
-                units_label = "nl{0}:-{1}/{0}".format(n, i)
+                units_label = f"nl{n}:-{i}/{n}"
                 units_sampler = hp_quniform(
                     units_label, min_units, max_units, step_size=1, make_int=True
                 )
@@ -516,7 +496,7 @@
         for ini_name in initializers:
             if ini_name not in imp_init_names:
                 raise NotImplementedError(
-                    "HyperScanner: Initializer {0} not implemented in MetaLayer.py".format(ini_name)
+                    f"HyperScanner: Initializer {ini_name} not implemented in MetaLayer.py"
                 )
             # For now we are going to use always all initializers and with default values
             ini_choices.append(ini_name)
