diff --git a/.gitmodules b/.gitmodules index 419e0e99..e69de29b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +0,0 @@ -[submodule "pythonCode/submodules/MEDimage"] - path = pythonCode/submodules/MEDimage - url = git@github.com:MahdiAll99/MEDimage.git - branch = dev_lab -[submodule "pythonCode/submodules/MEDprofiles"] - path = pythonCode/submodules/MEDprofiles - url = git@github.com:MEDomics-UdeS/MEDprofiles.git - branch = fusion_MEDomicsLab diff --git a/pythonCode/med_libs/GoExecutionScript.py b/pythonCode/med_libs/GoExecutionScript.py index 691000d9..1241c79b 100644 --- a/pythonCode/med_libs/GoExecutionScript.py +++ b/pythonCode/med_libs/GoExecutionScript.py @@ -17,19 +17,27 @@ def parse_arguments() -> tuple[dict, str]: parser = argparse.ArgumentParser() parser.add_argument('--json-param', type=str, default='.') parser.add_argument('--id', type=str, default='.') + parser.add_argument('--debug', type=bool, default=False) args = parser.parse_args() - json_params = json.loads(args.json_param) - id_ = args.id - return json_params, id_ + if not args.debug: + json_params = json.loads(args.json_param) + id_ = args.id + return json_params, id_ + else: + with open('json_params_dict.json', 'r') as f: + return json.load(f), '1234-4567-debug-id' -def get_response_from_error(e=None, toast=None): +def get_response_from_error(e=None, toast=None) -> dict: """ Gets the response from an error Args: e: The error toast: The toast message to send to the client, ignored if e is not None + + Returns: + The response dictionary """ if e is not None: print(e) @@ -58,11 +66,16 @@ class GoExecutionScript(ABC): _id: The id of the process """ - def __init__(self, json_params: dict, _id: str = "default_id"): + def __init__(self, json_params: dict, _id: str = "default_id", debug: bool = False): self._json_params = json_params self._error_handler = None self._progress = {"now": 0, "currentLabel": ""} self._id = _id + self._debug = debug + if self._debug: + # save json_params_dict to a file + 
with open('json_params_dict.json', 'w') as f: + json.dump(json_params, f, indent=4) def start(self): """ @@ -71,6 +84,9 @@ def start(self): try: self.push_progress() results = self._custom_process(self._json_params) + if self._debug: + with open("results.json", "w") as f: + f.write(json.dumps(results, indent=4)) self.send_response(results) except BaseException as e: if self._error_handler is not None: diff --git a/pythonCode/med_libs/MEDml/MEDexperiment.py b/pythonCode/med_libs/MEDml/MEDexperiment.py index f5b99a9f..8a02330d 100644 --- a/pythonCode/med_libs/MEDml/MEDexperiment.py +++ b/pythonCode/med_libs/MEDml/MEDexperiment.py @@ -68,9 +68,7 @@ def update(self, global_json_config: json = None): """Updates the experiment with the pipelines and the global configuration. Args: - pipelines (json, optional): The pipelines of the experiment. Defaults to None. global_json_config (json, optional): The global configuration of the experiment. Defaults to None. - nb_nodes (float, optional): The number of nodes in the experiment. Defaults to 0. 
""" self.pipelines = global_json_config['pipelines'] self.pipelines_to_execute = self.pipelines @@ -97,14 +95,12 @@ def create_next_nodes(self, next_nodes: json, pipelines_objects: dict) -> dict: nodes = {} if next_nodes != {}: for current_node_id, next_nodes_id_json in next_nodes.items(): - # if it is a create_model node, we need to point to the model node + # if it is a train_model node, we need to point to the model node # To be consistent with the rest of the nodes, # we create a new node with the same parameters but with the model id tmp_subid_list = current_node_id.split('*') if len(tmp_subid_list) > 1: - self.global_json_config['nodes'][current_node_id] = \ - copy.deepcopy( - self.global_json_config['nodes'][tmp_subid_list[0]]) + self.global_json_config['nodes'][current_node_id] = self.global_json_config['nodes'][tmp_subid_list[0]] self.global_json_config['nodes'][current_node_id]['associated_id'] = tmp_subid_list[1] self.global_json_config['nodes'][current_node_id]['id'] = current_node_id # then, we create the node normally @@ -113,6 +109,7 @@ def create_next_nodes(self, next_nodes: json, pipelines_objects: dict) -> dict: nodes[current_node_id] = self.handle_node_creation( node, pipelines_objects) nodes[current_node_id]['obj'].just_run = False + # if the node has next nodes if current_node_id in pipelines_objects: nodes[current_node_id]['next_nodes'] = \ self.create_next_nodes(next_nodes_id_json, @@ -234,15 +231,17 @@ def execute_next_nodes(self, prev_node: Node, next_nodes_to_execute: json, next_ if next_nodes_to_execute != {}: for current_node_id, next_nodes_id_json in next_nodes_to_execute.items(): + node_can_go = True node_info = next_nodes[current_node_id] + node = node_info['obj'] experiment = self.copy_experiment(experiment) exp_to_return = experiment - node = node_info['obj'] self._progress['currentLabel'] = node.username if not node.has_run() or prev_node.has_changed(): + data = node.execute(experiment, **prev_node.get_info_for_next_node()) 
node_info['results'] = { 'prev_node_id': prev_node.id, - 'data': node.execute(experiment, **prev_node.get_info_for_next_node()), + 'data': data, } # Clean node return experiment if "experiment" in node_info['results']['data']: @@ -273,7 +272,7 @@ def execute_next_nodes(self, prev_node: Node, next_nodes_to_execute: json, next_ results=results[current_node_id]['next_nodes'], experiment=exp_to_return ) - print(f'flag-{node.username}') + print(f'END-{node.username}') @abstractmethod def modify_node_info(self, node_info: dict, node: Node, experiment: dict): @@ -378,3 +377,27 @@ def set_progress(self, now: int = -1, label: str = "same") -> None: label = self._progress['currentLabel'] self._progress = {'currentLabel': label, 'now': now} + def make_save_ready(self): + """Makes the experiment ready to be saved. + """ + self._make_save_ready_rec(self.pipelines_objects) + + @abstractmethod + def _make_save_ready_rec(self, next_nodes: dict): + """ + Recursive function that makes the experiment ready to be saved. + """ + pass + + def init_obj(self): + """ + Initializes the experiment object (pycaret) from a path. + """ + self._init_obj_rec(self.pipelines_objects) + + @abstractmethod + def _init_obj_rec(self, next_nodes: dict): + """ + Recursive function that initializes the experiment object (pycaret) from a path. 
+ """ + pass diff --git a/pythonCode/med_libs/MEDml/MEDexperiment_learning.py b/pythonCode/med_libs/MEDml/MEDexperiment_learning.py index 1ae0a641..d1b7285d 100644 --- a/pythonCode/med_libs/MEDml/MEDexperiment_learning.py +++ b/pythonCode/med_libs/MEDml/MEDexperiment_learning.py @@ -74,6 +74,9 @@ def create_Node(self, node_config: dict): elif node_type == "finalize": from med_libs.MEDml.nodes.Finalize import Finalize return Finalize(node_config['id'], self.global_json_config) + elif node_type == "group_models": + from med_libs.MEDml.nodes.GroupModels import GroupModels + return GroupModels(node_config['id'], self.global_json_config) def setup_dataset(self, node: Node): """Sets up the dataset for the experiment.\n @@ -111,6 +114,12 @@ def setup_dataset(self, node: Node): elif kwargs['use_gpu'] == "False": kwargs['use_gpu'] = False + if 'index' in kwargs: + if kwargs['index'] == "True": + kwargs['index'] = True + elif kwargs['index'] == "False": + kwargs['index'] = False + # add the imports node.CodeHandler.add_import("import numpy as np") node.CodeHandler.add_import("import pandas as pd") @@ -135,9 +144,16 @@ def setup_dataset(self, node: Node): medml_logger = MEDml_logger() # setup the experiment - pycaret_exp.setup(temp_df, log_experiment=medml_logger, **kwargs) - node.CodeHandler.add_line( - "code", f"pycaret_exp.setup(temp_df, {node.CodeHandler.convert_dict_to_params(kwargs)})") + if 'test_data' in kwargs: + test_data_df = pd.read_csv(kwargs['test_data']['path']) + node.CodeHandler.add_line("code", f"test_data_df = pd.read_csv('{kwargs['test_data']}'") + node.CodeHandler.add_line("code", f"pycaret_exp.setup(temp_df, test_data=test_data_df, {node.CodeHandler.convert_dict_to_params(kwargs)})") + del kwargs['test_data'] + pycaret_exp.setup(temp_df, test_data=test_data_df, log_experiment=medml_logger, **kwargs) + else: + pycaret_exp.setup(temp_df, log_experiment=medml_logger, **kwargs) + node.CodeHandler.add_line("code", f"pycaret_exp.setup(temp_df, 
{node.CodeHandler.convert_dict_to_params(kwargs)})") + node.CodeHandler.add_line( "code", f"dataset = pycaret_exp.get_config('X').join(pycaret_exp.get_config('y'))") dataset_metaData = { @@ -169,3 +185,32 @@ def setup_dataset(self, node: Node): 'df': temp_df } + def _make_save_ready_rec(self, next_nodes: dict): + for node_id, node_content in next_nodes.items(): + saved_path = os.path.join( + self.global_json_config['internalPaths']['exp'], f"exp_{node_id.replace('*', '--')}.pycaretexp") + if 'exp_path' in node_content['experiment']: + saved_path = node_content['experiment']['exp_path'] + + data = node_content['experiment']['pycaret_exp'].data + self.sceneZipFile.write_to_zip( + custom_actions=lambda path: node_content['experiment']['pycaret_exp'].save_experiment(saved_path)) + node_content['experiment']['exp_path'] = saved_path + node_content['experiment']['dataset'] = data + node_content['experiment']['pycaret_exp'] = None + self._make_save_ready_rec(node_content['next_nodes']) + + def _init_obj_rec(self, next_nodes: dict): + for node_id, node_content in next_nodes.items(): + data = node_content['experiment']['dataset'] + pycaret_exp = create_pycaret_exp( + ml_type=self.global_json_config['MLType']) + saved_path = node_content['experiment']['exp_path'] + + def get_experiment(pycaret_exp, data, saved_path): + return pycaret_exp.load_experiment(saved_path, data=data) + + node_content['experiment']['pycaret_exp'] = self.sceneZipFile.read_in_zip( + custom_actions=lambda path: get_experiment(pycaret_exp, data, saved_path)) + + self._init_obj_rec(node_content['next_nodes']) diff --git a/pythonCode/med_libs/MEDml/nodes/Analyze.py b/pythonCode/med_libs/MEDml/nodes/Analyze.py index 0eadb02b..3c34fd93 100644 --- a/pythonCode/med_libs/MEDml/nodes/Analyze.py +++ b/pythonCode/med_libs/MEDml/nodes/Analyze.py @@ -40,7 +40,7 @@ def _execute(self, experiment: dict = None, **kwargs) -> json: """ selection = self.config_json['data']['internal']['selection'] print() - print(Fore.BLUE 
+ "=== Analysing === " + 'paths' + + print(Fore.BLUE + "=== Analysing === " + Fore.YELLOW + f"({self.username})" + Fore.RESET) print(Fore.CYAN + f"Using {selection}" + Fore.RESET) settings = copy.deepcopy(self.settings) diff --git a/pythonCode/med_libs/MEDml/nodes/GroupModels.py b/pythonCode/med_libs/MEDml/nodes/GroupModels.py new file mode 100644 index 00000000..432b4f44 --- /dev/null +++ b/pythonCode/med_libs/MEDml/nodes/GroupModels.py @@ -0,0 +1,94 @@ +import pandas as pd +import copy +import numpy as np +import json + +from sklearn.pipeline import Pipeline + +from .NodeObj import Node, format_model, NodeCodeHandler +from typing import Union +from colorama import Fore +from med_libs.server_utils import go_print + +DATAFRAME_LIKE = Union[dict, list, tuple, np.ndarray, pd.DataFrame] +TARGET_LIKE = Union[int, str, list, tuple, np.ndarray, pd.Series] + + +class GroupModels(Node): + """ + This class represents the GroupModels node. + """ + + def __init__(self, id_: int, global_config_json: json) -> None: + """ + Args: + id_ (int): The id of the node. + global_config_json (json): The global config json. + """ + super().__init__(id_, global_config_json) + self.config_json['instance'] = 0 + # print(f"GroupModels: {json.dumps(self.global_config_json, indent=4)}") + self.models_list = sorted(self.config_json['associated_id'].split('.')) + self.config_json['cur_models_list_id'] = [] + self.config_json['cur_models_list_obj'] = [] + self.config_json['cur_models_list_settings'] = [] + print(f"GroupModels: {self.models_list}") + print(f"{self.config_json['cur_models_list_id']}") + + def _execute(self, experiment: dict = None, **kwargs) -> json: + """ + This function is used to execute the node. 
+ """ + self.config_json['instance'] += 1 + self.config_json['cur_models_list_id'] += [kwargs['id'].split('*')[0]] + self.config_json['cur_models_list_settings'] += [kwargs.get('settings', {})] if 'settings' in kwargs else [] + print() + print(Fore.BLUE + "=== GroupModels === " + Fore.YELLOW + f"({self.username})" + Fore.RESET) + print(self.config_json['instance']) + trained_models = kwargs['models'] + trained_models_json = {} + + for model in kwargs['models']: + model = format_model(model) + print(Fore.CYAN + f"Grouping: {model.__class__.__name__}" + Fore.RESET) + + trained_models_copy = trained_models.copy() + self.config_json['cur_models_list_obj'] += trained_models_copy + self._info_for_next_node = {'models': self.config_json['cur_models_list_obj'], 'id': self.id} + self.CodeHandler.add_line("code", f"trained_models = []") + + isLast = sorted(self.config_json['cur_models_list_id']) == self.models_list or len( + self.config_json['cur_models_list_id']) > len(self.models_list) + if isLast: + for settings in self.config_json['cur_models_list_settings']: + model_string = format_model_process(settings) + self.CodeHandler.add_line("code", f"trained_models += {[model_str['content'] for model_str in model_string]}".replace("\"", "")) + return {"prev_node_complete": isLast} + + +def format_model_process(settings): + """ + This function is used to format the model process. + + Args: + settings (dict): The settings. + + Returns: + List[dict] : The formatted model process. 
+ """ + codeHandler = NodeCodeHandler() + codeHandler.reset() + settings_cp = copy.deepcopy(settings) + fct_type = settings_cp['fct_type'] + del settings_cp['fct_type'] + if fct_type == 'compare_models': + codeHandler.add_line( + "code", + f"pycaret_exp.compare_models({codeHandler.convert_dict_to_params(settings_cp)})") + + elif fct_type == 'train_model': + codeHandler.add_line( + "code", + f"pycaret_exp.create_model({codeHandler.convert_dict_to_params(settings_cp)})") + + return codeHandler.get_code() diff --git a/pythonCode/med_libs/MEDml/nodes/ModelHandler.py b/pythonCode/med_libs/MEDml/nodes/ModelHandler.py index 66fc4205..75a26723 100644 --- a/pythonCode/med_libs/MEDml/nodes/ModelHandler.py +++ b/pythonCode/med_libs/MEDml/nodes/ModelHandler.py @@ -67,7 +67,9 @@ def _execute(self, experiment: dict = None, **kwargs) -> json: f"trained_models = [pycaret_exp.create_model({self.CodeHandler.convert_dict_to_params(settings)})]" ) trained_models_copy = trained_models.copy() - self._info_for_next_node = {'models': trained_models} + settings_for_next = copy.deepcopy(settings) + settings_for_next['fct_type'] = self.type + self._info_for_next_node = {'models': trained_models, 'id': self.id, 'settings': settings_for_next} for model in trained_models_copy: model_copy = copy.deepcopy(model) trained_models_json[model_copy.__class__.__name__] = model_copy.__dict__ diff --git a/pythonCode/med_libs/MEDml/nodes/ModelIO.py b/pythonCode/med_libs/MEDml/nodes/ModelIO.py index e2afba40..cfebde1a 100644 --- a/pythonCode/med_libs/MEDml/nodes/ModelIO.py +++ b/pythonCode/med_libs/MEDml/nodes/ModelIO.py @@ -125,6 +125,6 @@ def _execute(self, experiment: dict = None, **kwargs) -> json: "code", f"pycaret_exp.load_model({self.CodeHandler.convert_dict_to_params(settings_copy)})" ) - self._info_for_next_node = {'models': [trained_model]} + self._info_for_next_node = {'models': [trained_model], 'id': self.id} return return_val diff --git a/pythonCode/med_libs/MEDml/nodes/Optimize.py 
b/pythonCode/med_libs/MEDml/nodes/Optimize.py index ac1e1fdc..bfc346b4 100644 --- a/pythonCode/med_libs/MEDml/nodes/Optimize.py +++ b/pythonCode/med_libs/MEDml/nodes/Optimize.py @@ -42,8 +42,7 @@ def _execute(self, experiment: dict = None, **kwargs) -> json: if "models" in self.type: self.CodeHandler.add_line( "code", - f"optimized_model = pycaret_exp.{self.type}(trained_models, {self.CodeHandler.convert_dict_to_params(settings)})", - 1) + f"optimized_model = pycaret_exp.{self.type}(trained_models, {self.CodeHandler.convert_dict_to_params(settings)})", 1) trained_models.append(getattr(experiment['pycaret_exp'], self.type)(input_models, **settings)) else: self.CodeHandler.add_line("code", f"trained_models_optimized = []") @@ -65,12 +64,11 @@ def _execute(self, experiment: dict = None, **kwargs) -> json: self.CodeHandler.add_line( "code", f"trained_models = trained_models_optimized") trained_models_copy = trained_models.copy() - self._info_for_next_node = {'models': trained_models} + self._info_for_next_node = {'models': trained_models, 'id': self.id} for model in trained_models_copy: model_copy = copy.deepcopy(model) trained_models_json[model_copy.__class__.__name__] = model_copy.__dict__ for key, value in model_copy.__dict__.items(): if isinstance(value, np.ndarray): - trained_models_json[model_copy.__class__.__name__][key] = value.tolist( - ) + trained_models_json[model_copy.__class__.__name__][key] = value.tolist() return trained_models_json diff --git a/pythonCode/med_libs/MEDml/utils/settings_generator/settings_generator.py b/pythonCode/med_libs/MEDml/utils/settings_generator/settings_generator.py index 67e9de48..fb59e4a5 100644 --- a/pythonCode/med_libs/MEDml/utils/settings_generator/settings_generator.py +++ b/pythonCode/med_libs/MEDml/utils/settings_generator/settings_generator.py @@ -12,6 +12,11 @@ import dpath.util as dp from collections.abc import MutableMapping +# EXAMPLE USAGE +# cd pythonCode/med_libs/MEDml/utils/settings_generator +# python 
settings_generator.py --ml_type classification + + # python scripts arguments import argparse parser = argparse.ArgumentParser(description='Script so useful.') @@ -163,6 +168,10 @@ "info": ['load_model'], "code": """""" }, + 'group_models': { + "info": [], + "code": """""" + }, } @@ -352,6 +361,39 @@ def specific_case(dict_settings: dict) -> dict: del dict_settings['analyze']['interpret_model']['options']['save'] del dict_settings['load_model']['options']['model_name'] + # NOT SUPPORTED + del dict_settings['dataset']['options']['data_func'] + del dict_settings['dataset']['options']['ordinal_features'] + del dict_settings['dataset']['options']['encoding_method'] + del dict_settings['dataset']['options']['group_features'] + del dict_settings['dataset']['options']['custom_pipeline'] + del dict_settings['dataset']['options']['experiment_custom_tags'] + del dict_settings['dataset']['options']['engine'] + del dict_settings['dataset']['options']['memory'] + del dict_settings['dataset']['options']['profile'] + del dict_settings['dataset']['options']['profile_kwargs'] + del dict_settings['compare_models']['options']['engine'] + del dict_settings['compare_models']['options']['fit_kwargs'] + del dict_settings['create_model']['options']['engine'] + del dict_settings['create_model']['options']['fit_kwargs'] + del dict_settings['create_model']['options']['experiment_custom_tags'] + del dict_settings['analyze']['plot_model']['options']['fit_kwargs'] + del dict_settings['analyze']['plot_model']['options']['plot_kwargs'] + del dict_settings['analyze']['dashboard']['options']['dashboard_kwargs'] + del dict_settings['analyze']['dashboard']['options']['run_kwargs'] + del dict_settings['finalize']['options']['fit_kwargs'] + del dict_settings['finalize']['options']['experiment_custom_tags'] + del dict_settings['load_model']['options']['platform'] + del dict_settings['load_model']['options']['authentication'] + del dict_settings['tune_model']['options']['custom_grid'] + del 
dict_settings['tune_model']['options']['custom_scorer'] + del dict_settings['tune_model']['options']['fit_kwargs'] + del dict_settings['ensemble_model']['options']['fit_kwargs'] + del dict_settings['blend_models']['options']['fit_kwargs'] + del dict_settings['stack_models']['options']['fit_kwargs'] + if ml_type == "classification": + del dict_settings['calibrate_model']['options']['fit_kwargs'] + return dict_settings diff --git a/pythonCode/modules/learning/run_experiment.py b/pythonCode/modules/learning/run_experiment.py index 1cc65a38..946c9ed5 100644 --- a/pythonCode/modules/learning/run_experiment.py +++ b/pythonCode/modules/learning/run_experiment.py @@ -75,5 +75,59 @@ def _update_progress_periodically(self): self.push_progress() time.sleep(1.0 / self._progress_update_frequency_HZ) +def save_experiment(experiment: MEDexperimentLearning): + """ + triggered by the button save in the dashboard, it saves the pipeline execution + + Returns: the results of the pipeline execution + """ + + # A ÉTÉ ENLEVÉ DANS UN COMMIT PRÉCÉDENT... À REMETTRE ET UPDATER POUR MONGODB + # go_print("saving experiment") + # experiment.make_save_ready() + # basePath = str(Path(os.path.dirname(os.path.abspath(__file__))).parent.parent) + # local_path = os.path.join(basePath, 'local_dir') + # if not os.path.exists(local_path): + # os.makedirs(local_path) + # with open(os.path.join(local_path, 'MEDexperiment_' + experiment.id + '.medexp'), 'wb') as f: + # pickle.dump(experiment, f) + # del experiment + pass + + + +def load_experiment(id_): + """ + triggered by the button load in the dashboard, it loads the pipeline execution + + Returns: the previously saved MEDexperiment + """ + # A ÉTÉ ENLEVÉ DANS UN COMMIT PRÉCÉDENT... 
À REMETTRE ET UPDATER POUR MONGODB + # go_print("loading experiment") + # basePath = str(Path(os.path.dirname(os.path.abspath(__file__))).parent.parent) + # local_path = os.path.join(basePath, 'local_dir') + # if not os.path.exists(local_path): + # os.makedirs(local_path) + # with open(os.path.join(local_path, 'MEDexperiment_' + id_ + '.medexp'), 'rb') as f: + # experiment = pickle.load(f) + # experiment.init_obj() + # return experiment + pass + + +def is_experiment_exist(id_): + """ + triggered by the button load in the dashboard, it loads the pipeline execution + + Returns: the results of the pipeline execution + """ + # A ÉTÉ ENLEVÉ DANS UN COMMIT PRÉCÉDENT... À REMETTRE ET UPDATER POUR MONGODB + # basePath = str(Path(os.path.dirname(os.path.abspath(__file__))).parent.parent) + # local_path = os.path.join(basePath, 'local_dir') + # if not os.path.exists(local_path): + # os.makedirs(local_path) + # return os.path.exists(os.path.join(local_path, 'MEDexperiment_' + id_ + '.medexp')) + pass + run_experiment = GoExecScriptRunExperiment(json_params_dict, id_, True) run_experiment.start() diff --git a/renderer/components/flow/node.jsx b/renderer/components/flow/node.jsx index c6777e49..9d672e58 100644 --- a/renderer/components/flow/node.jsx +++ b/renderer/components/flow/node.jsx @@ -53,6 +53,7 @@ const NodeObject = ({ id, data, nodeSpecific, nodeBody, defaultSettings, onClick // update warnings when the node is loaded useEffect(() => { updateHasWarning(data) + console.log(data.internal) }, []) /** @@ -116,7 +117,7 @@ const NodeObject = ({ id, data, nodeSpecific, nodeBody, defaultSettings, onClick pt={{ body: { className: `${nodeBody ? "padding-0_2rem-important" : "padding-0-important"}` } }} - onClick={(e) => (onClickCustom ? onClickCustom(e) : op.current.toggle(e))} + onClick={(e) => (onClickCustom ? 
onClickCustom(e) : data.internal.type != "group_models" && op.current.toggle(e))} // if the node has run and the results pane is displayed, the node is displayed normally // if the node has not run and the results pane is displayed, the node is displayed with a notRun class (see .css file) className={`text-left ${data.internal.hasRun && showResultsPane ? "" : showResultsPane ? "notRun" : ""}`} @@ -163,11 +164,11 @@ const NodeObject = ({ id, data, nodeSpecific, nodeBody, defaultSettings, onClick {nodeBody && <>{nodeBody}} - {!isGroupNode && ( + {!isGroupNode && data.internal.type != "group_models" && ( <> {/* here is an overlay panel that is displayed when the user clicks on the node name. It contains the settings of the node*/} op.current.hide(e)}> - +
diff --git a/renderer/components/flow/results/pipelinesResults.jsx b/renderer/components/flow/results/pipelinesResults.jsx index 78dbf1cc..8b7400aa 100644 --- a/renderer/components/flow/results/pipelinesResults.jsx +++ b/renderer/components/flow/results/pipelinesResults.jsx @@ -28,6 +28,9 @@ import { FlowResultsContext } from "../context/flowResultsContext" */ const checkIfObjectContainsId = (obj, id) => { let res = false + if (!obj) { + return res + } Object.keys(obj).forEach((key) => { if (key.includes(id)) { res = obj[key] @@ -77,24 +80,42 @@ const PipelineResult = ({ pipeline, selectionMode, flowContent }) => { let selectedNode = flowContent.nodes.find((node) => node.id == selectedId) let resultsCopy = deepCopy(flowResults) let selectedResults = false + let isGroupModelsFinal = true + let isAfterGroupModels = false + let passGroupModels = false pipeline.forEach((id) => { resultsCopy = checkIfObjectContainsId(resultsCopy, id) if (resultsCopy) { + let curNode = flowContent.nodes.find((node) => node.id == id) + console.log("curNode", curNode, "type", curNode.data.internal.type) + if (curNode.data.internal.type == "group_models") { + console.log("FLAG") + passGroupModels = true + console.log("passGroupModels", passGroupModels) + } if (id == selectedId) { selectedResults = resultsCopy.results - } else { - resultsCopy = resultsCopy.next_nodes + isAfterGroupModels = passGroupModels + } + + console.log("resultsCopy", resultsCopy) + if (resultsCopy.results && resultsCopy.results.data && "prev_node_complete" in resultsCopy.results.data) { + console.log("prev_node_complete", resultsCopy.results.data.prev_node_complete, isAfterGroupModels) + + isGroupModelsFinal = resultsCopy.results.data.prev_node_complete } + resultsCopy = resultsCopy.next_nodes } else { !selectedNode.data.internal.hasRun && (toReturn =
Has not been run yet !
) } }) + console.log("isAfterGroupModels", isAfterGroupModels) console.log("selectedResults", selectedResults) - if (selectedResults) { + if (selectedResults && (isGroupModelsFinal || !isAfterGroupModels)) { let type = selectedNode.data.internal.type console.log("type", type) if (type == "dataset" || type == "clean") { - toReturn = + toReturn = } else if (["train_model", "compare_models", "stack_models", "ensemble_model", "tune_model", "blend_models", "calibrate_model"].includes(type)) { toReturn = } else if (type == "analyze") { @@ -104,8 +125,6 @@ const PipelineResult = ({ pipeline, selectionMode, flowContent }) => { } else { toReturn =
Results not available for this node type
} - } else { - toReturn =
Has not been run yet !
} } @@ -191,8 +210,16 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { const createTitleFromPipe = useCallback( (pipeline) => { let pipelineId = pipeline.join("-") - const getName = (id) => { + const getName = (id, pipeline = null) => { let node = flowContent.nodes.find((node) => node.id == id) + if (pipeline) { + let nextNode = pipeline.indexOf(id) + 1 < pipeline.length ? flowContent.nodes.find((node) => node.id == pipeline[pipeline.indexOf(id) + 1]) : null + // if (nextNode && nextNode.data.internal.type == "group_models") { + // let prevEdges = flowContent.edges.filter((edge) => edge.target == nextNode.id) + // let prevIds = prevEdges.map((edge) => edge.source) + // return prevIds.map((id) => getName(id)).join(" & ") + // } + } return node && node.data.internal.name } @@ -234,12 +261,22 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { console.log("code generation", pipeline) let resultsCopy = deepCopy(flowResults) console.log("resultsCopy", resultsCopy) - pipeline.forEach((id) => { + pipeline.forEach((id, index) => { + // check if next node is a group_models + let nextNode = index + 1 < pipeline.length ? 
flowContent.nodes.find((node) => node.id == pipeline[index + 1]) : null + console.log("nextNode", nextNode) + let isNextGroupModels = nextNode && nextNode.data.internal.type == "group_models" + if (isNextGroupModels) { + console.log("next node is a group_models") + console.log(checkIfObjectContainsId(resultsCopy, id)) + } let nodeResults = checkIfObjectContainsId(resultsCopy, id) if (nodeResults) { - finalCode = [...finalCode, ...Object.values(nodeResults.results.code.content)] - console.log("imports", Object.values(nodeResults.results.code.imports)) - finalImports = [...finalImports, ...Object.values(nodeResults.results.code.imports)] + if (!isNextGroupModels) { + finalCode = [...finalCode, ...Object.values(nodeResults.results.code.content)] + console.log("imports", Object.values(nodeResults.results.code.imports)) + finalImports = [...finalImports, ...Object.values(nodeResults.results.code.imports)] + } resultsCopy = nodeResults.next_nodes } else { console.log("id " + id + " not found in results") @@ -249,7 +286,7 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { console.log("final code:") console.log(finalImports) let notebookID = await createNoteBookDoc(finalCode, finalImports) - lockDataset(flowResults, notebookID) // Lock the dataset to avoid the user to modify or delete it + lockDataset(flowResults, notebookID) // Lock the dataset to avoid the user to modify or delete it } /** @@ -264,7 +301,7 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { */ const createNoteBookDoc = async (code, imports) => { let newLineChar = "\n" // before was process.platform === "linux" ? 
"\n" : "" - let notebook = loadJsonPath([getBasePath(EXPERIMENTS), sceneName, "notebooks", pipeline.map((id) => getName(id)).join("-")].join(getPathSeparator()) + ".ipynb") + let notebook = loadJsonPath([getBasePath(EXPERIMENTS), sceneName, "notebooks", pipeline.map((id) => getName(id, pipeline)).join("-")].join(getPathSeparator()) + ".ipynb") notebook = notebook ? deepCopy(notebook) : deepCopy(loadJsonPath(isProd ? Path.join(process.resourcesPath, "baseFiles", "emptyNotebook.ipynb") : "./baseFiles/emptyNotebook.ipynb")) notebook.cells = [] let lastType = "md" @@ -302,7 +339,12 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { } } // HEADER - addMarkdown(["## Notebook automatically generated\n\n", "**Scene:** " + sceneName + "\n\n", "**Pipeline:** " + pipeline.map((id) => getName(id)).join(" ➡️ ") + "\n\n", "**Date:** " + new Date().toLocaleString() + "\n\n"]) + addMarkdown([ + "## Notebook automatically generated\n\n", + "**Scene:** " + sceneName + "\n\n", + "**Pipeline:** " + pipeline.map((id) => getName(id, pipeline)).join(" ➡️ ") + "\n\n", + "**Date:** " + new Date().toLocaleString() + "\n\n" + ]) // IMPORTS addCode(imports.map((imp) => imp.content + newLineChar)) // CODE @@ -319,12 +361,7 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { compileLines(linesOfSameType) // Save the notebook locally - const pathToCreate = MEDDataObject.writeFileSync( - notebook, - [getBasePath(EXPERIMENTS), sceneName, "notebooks"], - pipeline.map((id) => getName(id)).join("-"), - "ipynb" - ) + const pathToCreate = MEDDataObject.writeFileSync(notebook, [getBasePath(EXPERIMENTS), sceneName, "notebooks"], pipeline.map((id) => getName(id, pipeline)).join("-"), "ipynb") // Update the notebooks MEDDATAObject path const db = await connectToMongoDB() @@ -335,7 +372,7 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { // Save the notebook in the database const notebookObj = new MEDDataObject({ id: randomUUID(), 
- name: pipeline.map((id) => getName(id)).join("-") + ".ipynb", + name: pipeline.map((id) => getName(id, pipeline)).join("-") + ".ipynb", type: "ipynb", parentID: notebooksFolder.id, childrenIDs: [], @@ -408,11 +445,13 @@ const PipelinesResults = ({ pipelines, selectionMode, flowContent }) => { return ( setAccordionActiveIndex(e.index)} className="pipeline-results-accordion"> - {pipelines.map((pipeline, index) => ( - - - - ))} + {pipelines.map((pipeline, index) => { + return ( + + + + ) + })} ) } diff --git a/renderer/components/flow/results/resultsPane.jsx b/renderer/components/flow/results/resultsPane.jsx index 6a46da4f..34ae7b93 100644 --- a/renderer/components/flow/results/resultsPane.jsx +++ b/renderer/components/flow/results/resultsPane.jsx @@ -109,6 +109,7 @@ const ResultsPane = () => { let isValid = true path.forEach((id) => { let node = flowContent.nodes.find((node) => node.id == id) + // this condition is here because a group node creates another path that is not valid if (node.type == "groupNode") { isValid = false } diff --git a/renderer/components/flow/workflowBase.jsx b/renderer/components/flow/workflowBase.jsx index b9dcabf8..34f04dbc 100644 --- a/renderer/components/flow/workflowBase.jsx +++ b/renderer/components/flow/workflowBase.jsx @@ -156,13 +156,13 @@ const WorkflowBase = ({ isGoodConnection, groupNodeHandlingDefault, onDeleteNode const setHasRunRec = (obj) => { Object.keys(obj).forEach((id) => { setHasRun(id) - setHasRunRec(obj[id].next_nodes) + obj[id].next_nodes && setHasRunRec(obj[id].next_nodes) }) } if (Object.keys(flowResults).length > 0) { Object.keys(flowResults).forEach((id) => { setHasRun(id) - setHasRunRec(flowResults[id].next_nodes) + flowResults[id].next_nodes && setHasRunRec(flowResults[id].next_nodes) }) } else { nodes.forEach((node) => { @@ -245,10 +245,12 @@ const WorkflowBase = ({ isGoodConnection, groupNodeHandlingDefault, onDeleteNode const setHasRunRec = (obj) => { Object.keys(obj).forEach((id) => { - 
Object.keys(obj[id].next_nodes).forEach((nextId) => { - setHasRun(id, nextId) - }) - setHasRunRec(obj[id].next_nodes) + if (obj[id].next_nodes) { + Object.keys(obj[id].next_nodes).forEach((nextId) => { + setHasRun(id, nextId) + }) + obj[id].next_nodes && setHasRunRec(obj[id].next_nodes) + } }) } @@ -256,7 +258,7 @@ const WorkflowBase = ({ isGoodConnection, groupNodeHandlingDefault, onDeleteNode Object.keys(flowResults[id].next_nodes).forEach((nextId) => { setHasRun(id, nextId) }) - setHasRunRec(flowResults[id].next_nodes) + flowResults[id].next_nodes && setHasRunRec(flowResults[id].next_nodes) }) edges.forEach((edge) => { edge.data ? (edge.data.hasRun = edgesHasRun.includes(edge.id)) : (edge.data = { hasRun: edgesHasRun.includes(edge.id) }) diff --git a/renderer/components/learning/input.jsx b/renderer/components/learning/input.jsx index 64cf2100..ed847b07 100644 --- a/renderer/components/learning/input.jsx +++ b/renderer/components/learning/input.jsx @@ -232,7 +232,7 @@ const Input = ({ name, settingInfos, currentValue, onInputChange, disabled = fal return ( <> - { @@ -255,11 +255,7 @@ const Input = ({ name, settingInfos, currentValue, onInputChange, disabled = fal type: settingInfos.type }) }} - > - - - - + /> {createTooltip(settingInfos.tooltip, name)} @@ -274,14 +270,13 @@ const Input = ({ name, settingInfos, currentValue, onInputChange, disabled = fal {...customProps} disabled={disabled} value={{ name: currentValue }} - onChange={(e) =>{ + onChange={(e) => { setInputUpdate({ name: name, value: e.target.value.name, type: settingInfos.type }) - } - } + }} options={Object.entries(settingInfos.choices).map(([option]) => { return { name: option @@ -302,16 +297,14 @@ const Input = ({ name, settingInfos, currentValue, onInputChange, disabled = fal disabled={disabled} value={currentValue ? 
currentValue : []} filter - onChange={(newValue) => - { + onChange={(newValue) => { setInputUpdate({ name: name, value: newValue.value, type: settingInfos.type }) - currentValue = {name: newValue.value[0]} - } - } + currentValue = { name: newValue.value[0] } + }} options={Object.entries(settingInfos.choices).map(([option]) => { return { name: settingInfos.choices[option], @@ -525,7 +518,33 @@ const Input = ({ name, settingInfos, currentValue, onInputChange, disabled = fal {createTooltip(settingInfos.tooltip, name)} ) - + case "dataframe": + return ( + <> + + { + console.log("e", e, path) + if (path == "") { + setHasWarning({ state: true, tooltip:

No file selected

}) + } else { + setHasWarning({ state: false }) + } + setInputUpdate({ + name: name, + value: { name: e.target.value, path: path }, + type: settingInfos.type + }) + }} + /> +
+ {createTooltip(settingInfos.tooltip, name)} + + ) // for all the other types of input (basically a string input for now) default: return ( diff --git a/renderer/components/learning/modalSettingsChooser.jsx b/renderer/components/learning/modalSettingsChooser.jsx index 3774bb11..2daf9965 100644 --- a/renderer/components/learning/modalSettingsChooser.jsx +++ b/renderer/components/learning/modalSettingsChooser.jsx @@ -4,6 +4,7 @@ import Button from "react-bootstrap/Button" import CheckOption from "./checkOption" import { useState, useEffect } from "react" import { FlowFunctionsContext } from "../flow/context/flowFunctionsContext" +import { Message } from "primereact/message" /** * @@ -45,9 +46,11 @@ const ModalSettingsChooser = ({ show, onHide, options, id, data }) => { {data.setupParam.title + " options"} {/* Display all the options available for the node */} - + {Object.entries(options).map(([optionName, optionInfos], i) => { - return + return ( + + ) })} diff --git a/renderer/components/learning/workflow.jsx b/renderer/components/learning/workflow.jsx index 7956e607..aaa2a995 100644 --- a/renderer/components/learning/workflow.jsx +++ b/renderer/components/learning/workflow.jsx @@ -282,9 +282,10 @@ const Workflow = ({ setWorkflowType, workflowType }) => { // recursively create tree from nodes const createTreeFromNodesRec = (node) => { let children = {} - + // for each edge, we check if the source node is the current node edges.forEach((edge) => { if (edge.source == node.id) { + // we find the target node associated with the edge let targetNode = deepCopy(nodes.find((node) => node.id === edge.target)) if (targetNode.type != "groupNode") { let subIdText = "" @@ -702,6 +703,21 @@ const Workflow = ({ setWorkflowType, workflowType }) => { hasModels = true } + if (nodeType == "group_models") { + edgesCopy = edgesCopy.filter((edge) => edge.target == currentNode.id) + console.log("edgesCopy", edgesCopy) + edgesCopy = edgesCopy.reduce((acc, edge) => { + if (edge.target == 
currentNode.id) { + let sourceNode = nodes.find((node) => node.id == edge.source) + if (sourceNode.data.setupParam.output.includes("model")) { + acc.push(edge) + } + } + return acc + }, []) + hasModels = true + } + // check if node has default values isValidDefault = isValidDefault && checkDefaultValues(currentNode) @@ -709,14 +725,13 @@ const Workflow = ({ setWorkflowType, workflowType }) => { if (node[key].nodes != {}) { // if this is a create model node, we need to add n pipelines if (hasModels) { - edgesCopy.forEach((edge) => { - let id = key + "*" + edge.source - if (key != up2Id) { - children[id] = cleanTreeDataRec(node[key].nodes) - } else { - children[id] = {} - } - }) + let allEdgesSourceIds = edgesCopy.map((edge) => edge.source).join(".") + let id = key + "*" + allEdgesSourceIds + if (key != up2Id) { + children[id] = cleanTreeDataRec(node[key].nodes) + } else { + children[id] = {} + } // if this is not a create model node, we continue normally } else { if (key != up2Id) { diff --git a/renderer/public/icon/learning/group_models.png b/renderer/public/icon/learning/group_models.png new file mode 100644 index 00000000..2f8da4f0 Binary files /dev/null and b/renderer/public/icon/learning/group_models.png differ diff --git a/renderer/public/setupVariables/learningNodesParams.jsx b/renderer/public/setupVariables/learningNodesParams.jsx index bc1072f2..0c70e3f8 100644 --- a/renderer/public/setupVariables/learningNodesParams.jsx +++ b/renderer/public/setupVariables/learningNodesParams.jsx @@ -69,6 +69,17 @@ const nodesParams = { title: "Compare models", possibleSettings: { classification: classificationSettings["compare_models"], regression: regressionSettings["compare_models"] } }, + group_models: { + type: "standardNode", + classes: "action group_models", + nbInput: 1, + nbOutput: 1, + input: ["model"], + output: ["model"], + img: "group_models.png", + title: "Group models", + possibleSettings: { classification: classificationSettings["group_models"], regression: 
regressionSettings["group_models"] } + }, load_model: { type: "loadModelNode", classes: "action load_model run", diff --git a/renderer/public/setupVariables/possibleSettings/learning/classificationSettings.js b/renderer/public/setupVariables/possibleSettings/learning/classificationSettings.js index 46a1a262..5e16101a 100644 --- a/renderer/public/setupVariables/possibleSettings/learning/classificationSettings.js +++ b/renderer/public/setupVariables/possibleSettings/learning/classificationSettings.js @@ -1,1118 +1,1208 @@ /* eslint-disable */ const classificationSettings = { - "clean": { - "options": { - "imputation_type": { - "type": "string", - "tooltip": "

The type of imputation to use. Can be either \u2018simple\u2019 or \u2018iterative\u2019.\nIf None, no imputation of missing values is performed.

\n", - "default_val": "simple" - }, - "normalize": { - "type": "bool", - "tooltip": "

When set to True, it transforms the features by scaling them to a given\nrange. Type of scaling is defined by the normalize_method parameter.

\n", - "default_val": "False" - }, - "normalize_method": { - "type": "string", - "tooltip": "

Defines the method for scaling. By default, normalize method is set to \u2018zscore\u2019\nThe standard zscore is calculated as z = (x - u) / s. Ignored when normalize\nis not True. The other options are:

\n
    \n
  • minmax: scales and translates each feature individually such that it is in

  • \n
\n

the range of 0 - 1.\n- maxabs: scales and translates each feature individually such that the\nmaximal absolute value of each feature will be 1.0. It does not\nshift/center the data, and thus does not destroy any sparsity.\n- robust: scales and translates each feature according to the Interquartile\nrange. When the dataset contains outliers, robust scaler often gives\nbetter results.

\n", - "default_val": "zscore" - }, - "iterative_imputation_iters": { - "type": "int", - "tooltip": "

Number of iterations. Ignored when imputation_type=simple.

\n", - "default_val": "5" - }, - "categorical_imputation": { - "type": "string", - "tooltip": "

Imputing strategy for categorical columns. Ignored when imputation_type= iterative. Choose from:

\n
\n
    \n
  • \u201cdrop\u201d: Drop rows containing missing values.

  • \n
  • \u201cmode\u201d: Impute with most frequent value.

  • \n
  • str: Impute with provided string.

  • \n
\n
\n", - "default_val": "mode" - }, - "categorical_iterative_imputer": { - "type": "string", - "tooltip": "

Regressor for iterative imputation of missing values in categorical features.\nIf None, it uses LGBClassifier. Ignored when imputation_type=simple.

\n", - "default_val": "lightgbm" - }, - "numeric_imputation": { - "type": "list", - "tooltip": "

Imputing strategy for numerical columns. Ignored when imputation_type= iterative. Choose from:

\n
\n
    \n
  • \u201cdrop\u201d: Drop rows containing missing values.

  • \n
  • \u201cmean\u201d: Impute with mean of column.

  • \n
  • \u201cmedian\u201d: Impute with median of column.

  • \n
  • \u201cmode\u201d: Impute with most frequent value.

  • \n
  • \u201cknn\u201d: Impute using a K-Nearest Neighbors approach.

  • \n
  • int or float: Impute with provided numerical value.

  • \n
\n
\n", - "default_val": "mean", - "choices": { - "drop": "Drop rows containing missing values", - "mean": "Impute with mean of column", - "median": "Impute with median of column", - "mode": "Impute with most frequent value", - "knn": "Impute using a K-Nearest Neighbors approach" - } - }, - "numeric_iterative_imputer": { - "type": "string", - "tooltip": "

Regressor for iterative imputation of missing values in numeric features.\nIf None, it uses LGBClassifier. Ignored when imputation_type=simple.

\n", - "default_val": "lightgbm" - }, - "transformation": { - "type": "bool", - "tooltip": "

When set to True, it applies the power transform to make data more Gaussian-like.\nType of transformation is defined by the transformation_method parameter.

\n", - "default_val": "False" - }, - "transformation_method": { - "type": "string", - "tooltip": "

Defines the method for transformation. By default, the transformation method is\nset to \u2018yeo-johnson\u2019. The other available option for transformation is \u2018quantile\u2019.\nIgnored when transformation is not True.

\n", - "default_val": "yeo-johnson" - }, - "pca": { - "type": "bool", - "tooltip": "

When set to True, dimensionality reduction is applied to project the data into\na lower dimensional space using the method defined in pca_method parameter.

\n", - "default_val": "False" - }, - "pca_method": { - "type": "string", - "tooltip": "
\n
Method with which to apply PCA. Possible values are:
    \n
  • \u2018linear\u2019: Uses Singular Value Decomposition.

  • \n
  • \u2018kernel\u2019: Dimensionality reduction through the use of RBF kernel.

  • \n
  • \u2018incremental\u2019: Similar to \u2018linear\u2019, but more efficient for large datasets.

  • \n
\n
\n
\n", - "default_val": "linear" - }, - "pca_components": { - "type": "int-float-str", - "tooltip": "
\n
Number of components to keep. This parameter is ignored when pca=False.
    \n
  • If None: All components are kept.

  • \n
  • If int: Absolute number of components.

  • \n
  • \n
    If float: Such an amount that the variance that needs to be explained

    is greater than the percentage specified by n_components.\nValue should lie between 0 and 1 (ony for pca_method=\u2019linear\u2019).

    \n
    \n
    \n
  • \n
  • If \u201cmle\u201d: Minka\u2019s MLE is used to guess the dimension (ony for pca_method=\u2019linear\u2019).

  • \n
\n
\n
\n", - "default_val": "None" - }, - "remove_outliers": { - "type": "bool", - "tooltip": "

When set to True, outliers from the training data are removed using an\nIsolation Forest.

\n", - "default_val": "False" - }, - "outliers_threshold": { - "type": "float", - "tooltip": "

The percentage of outliers to be removed from the dataset. Ignored\nwhen remove_outliers=False.

\n", - "default_val": "0.05" - }, - "remove_multicollinearity": { - "type": "bool", - "tooltip": "

When set to True, features with the inter-correlations higher than\nthe defined threshold are removed. For each group, it removes all\nexcept the feature with the highest correlation to y.

\n", - "default_val": "False" - }, - "multicollinearity_threshold": { - "type": "float", - "tooltip": "

Minimum absolute Pearson correlation to identify correlated\nfeatures. The default value removes equal columns. Ignored when\nremove_multicollinearity is not True.

\n", - "default_val": "0.9" - }, - "polynomial_features": { - "type": "bool", - "tooltip": "

When set to True, new features are derived using existing numeric features.

\n", - "default_val": "False" - }, - "polynomial_degree": { - "type": "int", - "tooltip": "

Degree of polynomial features. For example, if an input sample is two dimensional\nand of the form [a, b], the polynomial features with degree = 2 are:\n[1, a, b, a^2, ab, b^2]. Ignored when polynomial_features is not True.

\n", - "default_val": "2" - }, - "feature_selection": { - "type": "bool", - "tooltip": "

When set to True, a subset of features is selected based on a feature\nimportance score determined by feature_selection_estimator.

\n", - "default_val": "False" - }, - "feature_selection_estimator": { - "type": "string", - "tooltip": "

Classifier used to determine the feature importances. The\nestimator should have a feature_importances_ or coef_\nattribute after fitting. If None, it uses LGBClassifier. This\nparameter is ignored when feature_selection_method=univariate.

\n", - "default_val": "lightgbm" - }, - "feature_selection_method": { - "type": "string", - "tooltip": "
\n
Algorithm for feature selection. Choose from:
    \n
  • \u2018univariate\u2019: Uses sklearn\u2019s SelectKBest.

  • \n
  • \u2018classic\u2019: Uses sklearn\u2019s SelectFromModel.

  • \n
  • \u2018sequential\u2019: Uses sklearn\u2019s SequentialFeatureSelector.

  • \n
\n
\n
\n", - "default_val": "classic" - }, - "n_features_to_select": { - "type": "float", - "tooltip": "

The maximum number of features to select with feature_selection. If <1,\nit\u2019s the fraction of starting features. Note that this parameter doesn\u2019t\ntake features in ignore_features or keep_features into account\nwhen counting.

\n", - "default_val": "0.2" - } - }, - "code": "" + clean: { + options: { + imputation_type: { + type: "string", + tooltip: "

The type of imputation to use. Can be either \u2018simple\u2019 or \u2018iterative\u2019.\nIf None, no imputation of missing values is performed.

\n", + default_val: "simple" + }, + normalize: { + type: "bool", + tooltip: "

When set to True, it transforms the features by scaling them to a given\nrange. Type of scaling is defined by the normalize_method parameter.

\n", + default_val: "False" + }, + normalize_method: { + type: "string", + tooltip: + "

Defines the method for scaling. By default, normalize method is set to \u2018zscore\u2019\nThe standard zscore is calculated as z = (x - u) / s. Ignored when normalize\nis not True. The other options are:

\n
    \n
  • minmax: scales and translates each feature individually such that it is in

  • \n
\n

the range of 0 - 1.\n- maxabs: scales and translates each feature individually such that the\nmaximal absolute value of each feature will be 1.0. It does not\nshift/center the data, and thus does not destroy any sparsity.\n- robust: scales and translates each feature according to the Interquartile\nrange. When the dataset contains outliers, robust scaler often gives\nbetter results.

\n", + default_val: "zscore" + }, + iterative_imputation_iters: { + type: "int", + tooltip: "

Number of iterations. Ignored when imputation_type=simple.

\n", + default_val: "5" + }, + categorical_imputation: { + type: "string", + tooltip: + "

Imputing strategy for categorical columns. Ignored when imputation_type= iterative. Choose from:

\n
\n
    \n
  • \u201cdrop\u201d: Drop rows containing missing values.

  • \n
  • \u201cmode\u201d: Impute with most frequent value.

  • \n
  • str: Impute with provided string.

  • \n
\n
\n", + default_val: "mode" + }, + categorical_iterative_imputer: { + type: "string", + tooltip: "

Regressor for iterative imputation of missing values in categorical features.\nIf None, it uses LGBClassifier. Ignored when imputation_type=simple.

\n", + default_val: "lightgbm" + }, + numeric_imputation: { + type: "list", + tooltip: + "

Imputing strategy for numerical columns. Ignored when imputation_type= iterative. Choose from:

\n
\n
    \n
  • \u201cdrop\u201d: Drop rows containing missing values.

  • \n
  • \u201cmean\u201d: Impute with mean of column.

  • \n
  • \u201cmedian\u201d: Impute with median of column.

  • \n
  • \u201cmode\u201d: Impute with most frequent value.

  • \n
  • \u201cknn\u201d: Impute using a K-Nearest Neighbors approach.

  • \n
  • int or float: Impute with provided numerical value.

  • \n
\n
\n", + default_val: "mean", + choices: { + drop: "Drop rows containing missing values", + mean: "Impute with mean of column", + median: "Impute with median of column", + mode: "Impute with most frequent value", + knn: "Impute using a K-Nearest Neighbors approach" + } + }, + numeric_iterative_imputer: { + type: "string", + tooltip: "

Regressor for iterative imputation of missing values in numeric features.\nIf None, it uses LGBClassifier. Ignored when imputation_type=simple.

\n", + default_val: "lightgbm" + }, + transformation: { + type: "bool", + tooltip: "

When set to True, it applies the power transform to make data more Gaussian-like.\nType of transformation is defined by the transformation_method parameter.

\n", + default_val: "False" + }, + transformation_method: { + type: "string", + tooltip: + "

Defines the method for transformation. By default, the transformation method is\nset to \u2018yeo-johnson\u2019. The other available option for transformation is \u2018quantile\u2019.\nIgnored when transformation is not True.

\n", + default_val: "yeo-johnson" + }, + pca: { + type: "bool", + tooltip: "

When set to True, dimensionality reduction is applied to project the data into\na lower dimensional space using the method defined in pca_method parameter.

\n", + default_val: "False" + }, + pca_method: { + type: "string", + tooltip: + "
\n
Method with which to apply PCA. Possible values are:
    \n
  • \u2018linear\u2019: Uses Singular Value Decomposition.

  • \n
  • \u2018kernel\u2019: Dimensionality reduction through the use of RBF kernel.

  • \n
  • \u2018incremental\u2019: Similar to \u2018linear\u2019, but more efficient for large datasets.

  • \n
\n
\n
\n", + default_val: "linear" + }, + pca_components: { + type: "int-float-str", + tooltip: + "
\n
Number of components to keep. This parameter is ignored when pca=False.
    \n
  • If None: All components are kept.

  • \n
  • If int: Absolute number of components.

  • \n
  • \n
    If float: Such an amount that the variance that needs to be explained

    is greater than the percentage specified by n_components.\nValue should lie between 0 and 1 (ony for pca_method=\u2019linear\u2019).

    \n
    \n
    \n
  • \n
  • If \u201cmle\u201d: Minka\u2019s MLE is used to guess the dimension (ony for pca_method=\u2019linear\u2019).

  • \n
\n
\n
\n", + default_val: "None" + }, + remove_outliers: { + type: "bool", + tooltip: "

When set to True, outliers from the training data are removed using an\nIsolation Forest.

\n", + default_val: "False" + }, + outliers_threshold: { + type: "float", + tooltip: "

The percentage of outliers to be removed from the dataset. Ignored\nwhen remove_outliers=False.

\n", + default_val: "0.05" + }, + remove_multicollinearity: { + type: "bool", + tooltip: + "

When set to True, features with the inter-correlations higher than\nthe defined threshold are removed. For each group, it removes all\nexcept the feature with the highest correlation to y.

\n", + default_val: "False" + }, + multicollinearity_threshold: { + type: "float", + tooltip: "

Minimum absolute Pearson correlation to identify correlated\nfeatures. The default value removes equal columns. Ignored when\nremove_multicollinearity is not True.

\n", + default_val: "0.9" + }, + polynomial_features: { + type: "bool", + tooltip: "

When set to True, new features are derived using existing numeric features.

\n", + default_val: "False" + }, + polynomial_degree: { + type: "int", + tooltip: + "

Degree of polynomial features. For example, if an input sample is two dimensional\nand of the form [a, b], the polynomial features with degree = 2 are:\n[1, a, b, a^2, ab, b^2]. Ignored when polynomial_features is not True.

\n", + default_val: "2" + }, + feature_selection: { + type: "bool", + tooltip: "

When set to True, a subset of features is selected based on a feature\nimportance score determined by feature_selection_estimator.

\n", + default_val: "False" + }, + feature_selection_estimator: { + type: "string", + tooltip: + "

Classifier used to determine the feature importances. The\nestimator should have a feature_importances_ or coef_\nattribute after fitting. If None, it uses LGBClassifier. This\nparameter is ignored when feature_selection_method=univariate.

\n", + default_val: "lightgbm" + }, + feature_selection_method: { + type: "string", + tooltip: + "
\n
Algorithm for feature selection. Choose from:
    \n
  • \u2018univariate\u2019: Uses sklearn\u2019s SelectKBest.

  • \n
  • \u2018classic\u2019: Uses sklearn\u2019s SelectFromModel.

  • \n
  • \u2018sequential\u2019: Uses sklearn\u2019s SequentialFeatureSelector.

  • \n
\n
\n
\n", + default_val: "classic" + }, + n_features_to_select: { + type: "float", + tooltip: + "

The maximum number of features to select with feature_selection. If <1,\nit\u2019s the fraction of starting features. Note that this parameter doesn\u2019t\ntake features in ignore_features or keep_features into account\nwhen counting.

\n", + default_val: "0.2" + } }, - "dataset": { - "options": { - "data_func": { - "type": "data-function", - "tooltip": "

The function that generate data (the dataframe-like input). This\nis useful when the dataset is large, and you need parallel operations\nsuch as compare_models. It can avoid broadcasting large dataset\nfrom driver to workers. Notice one and only one of data and\ndata_func must be set.

\n", - "default_val": "" - }, - "index": { - "type": "bool-int-str", - "tooltip": "
\n
Handle indices in the data dataframe.
    \n
  • If False: Reset to RangeIndex.

  • \n
  • If True: Keep the provided index.

  • \n
  • If int: Position of the column to use as index.

  • \n
  • If str: Name of the column to use as index.

  • \n
  • If sequence: Array with shape=(n_samples,) to use as index.

  • \n
\n
\n
\n", - "default_val": "True" - }, - "train_size": { - "type": "float", - "tooltip": "

Proportion of the dataset to be used for training and validation. Should be\nbetween 0.0 and 1.0.

\n", - "default_val": "0.7" - }, - "test_data": { - "type": "dataframe", - "tooltip": "

If not None, test_data is used as a hold-out set and train_size parameter\nis ignored. The columns of data and test_data must match.

\n", - "default_val": "None" - }, - "ordinal_features": { - "type": "dict", - "tooltip": "

Categorical features to be encoded ordinally. For example, a categorical\nfeature with \u2018low\u2019, \u2018medium\u2019, \u2018high\u2019 values where low < medium < high can\nbe passed as ordinal_features = {\u2018column_name\u2019 : [\u2018low\u2019, \u2018medium\u2019, \u2018high\u2019]}.

\n", - "default_val": "None" - }, - "numeric_features": { - "type": "custom-list", - "tooltip": "

If the inferred data types are not correct, the numeric_features param can\nbe used to define the data types. It takes a list of strings with column\nnames that are numeric.

\n", - "default_val": "None" - }, - "categorical_features": { - "type": "custom-list", - "tooltip": "

If the inferred data types are not correct, the categorical_features param\ncan be used to define the data types. It takes a list of strings with column\nnames that are categorical.

\n", - "default_val": "None" - }, - "date_features": { - "type": "custom-list", - "tooltip": "

If the inferred data types are not correct, the date_features param can be\nused to overwrite the data types. It takes a list of strings with column\nnames that are DateTime.

\n", - "default_val": "None" - }, - "text_features": { - "type": "custom-list", - "tooltip": "

Column names that contain a text corpus. If None, no text features are\nselected.

\n", - "default_val": "None" - }, - "ignore_features": { - "type": "custom-list", - "tooltip": "

ignore_features param can be used to ignore features during preprocessing\nand model training. It takes a list of strings with column names that are\nto be ignored.

\n", - "default_val": "None" - }, - "keep_features": { - "type": "custom-list", - "tooltip": "

keep_features param can be used to always keep specific features during\npreprocessing, i.e. these features are never dropped by any kind of\nfeature selection. It takes a list of strings with column names that are\nto be kept.

\n", - "default_val": "None" - }, - "preprocess": { - "type": "bool", - "tooltip": "

When set to False, no transformations are applied except for train_test_split\nand custom transformations passed in custom_pipeline param. Data must be\nready for modeling (no missing values, no dates, categorical data encoding),\nwhen preprocess is set to False.

\n", - "default_val": "True" - }, - "create_date_columns": { - "type": "custom-list", - "tooltip": "

Columns to create from the date features. Note that created features\nwith zero variance (e.g. the feature hour in a column that only contains\ndates) are ignored. Allowed values are datetime attributes from\npandas.Series.dt. The datetime format of the feature is inferred\nautomatically from the first non NaN value.

\n", - "default_val": "[\u201cday\u201d, \u201cmonth\u201d, \u201cyear\u201d]" - }, - "text_features_method": { - "type": "string", - "tooltip": "

Method with which to embed the text features in the dataset. Choose\nbetween \u201cbow\u201d (Bag of Words - CountVectorizer) or \u201ctf-idf\u201d (TfidfVectorizer).\nBe aware that the sparse matrix output of the transformer is converted\ninternally to its full array. This can cause memory issues for large\ntext embeddings.

\n", - "default_val": "\u201ctf-idf\u201d" - }, - "max_encoding_ohe": { - "type": "int", - "tooltip": "

Categorical columns with max_encoding_ohe or less unique values are\nencoded using OneHotEncoding. If more, the encoding_method estimator\nis used. Note that columns with exactly two classes are always encoded\nordinally. Set to below 0 to always use OneHotEncoding.

\n", - "default_val": "25" - }, - "encoding_method": { - "type": "category-encoders estimator", - "tooltip": "

A category-encoders estimator to encode the categorical columns\nwith more than max_encoding_ohe unique values. If None,\ncategory_encoders.target_encoder.TargetEncoder is used.

\n", - "default_val": "None" - }, - "rare_to_value": { - "type": "float", - "tooltip": "

Minimum fraction of category occurrences in a categorical column.\nIf a category is less frequent than rare_to_value * len(X), it is\nreplaced with the string in rare_value. Use this parameter to group\nrare categories before encoding the column. If None, ignores this step.

\n", - "default_val": "one" - }, - "rare_value": { - "type": "string", - "tooltip": "

Value with which to replace rare categories. Ignored when\nrare_to_value is None.

\n", - "default_val": "rare\u201d" - }, - "low_variance_threshold": { - "type": "float", - "tooltip": "

Remove features with a training-set variance lower than the provided\nthreshold. If 0, keep all features with non-zero variance, i.e. remove\nthe features that have the same value in all samples. If None, skip\nthis transformation step.

\n", - "default_val": "None" - }, - "group_features": { - "type": "dict", - "tooltip": "

When the dataset contains features with related characteristics,\nadd new fetaures with the following statistical properties of that\ngroup: min, max, mean, std, median and mode. The parameter takes a\ndict with the group name as key and a list of feature names\nbelonging to that group as value.

\n", - "default_val": "None" - }, - "drop_groups": { - "type": "bool", - "tooltip": "

Whether to drop the original features in the group. Ignored when\ngroup_features is None.

\n", - "default_val": "alse" - }, - "bin_numeric_features": { - "type": "custom-list", - "tooltip": "

To convert numeric features into categorical, bin_numeric_features parameter can\nbe used. It takes a list of strings with column names to be discretized. It does\nso by using \u2018sturges\u2019 rule to determine the number of clusters and then apply\nKMeans algorithm. Original values of the feature are then replaced by the\ncluster label.

\n", - "default_val": "None" - }, - "outliers_method": { - "type": "string", - "tooltip": "

Method with which to remove outliers. Ignored when remove_outliers=False.\nPossible values are:

\n
\n
    \n
  • \u2018iforest\u2019: Uses sklearn\u2019s IsolationForest.

  • \n
  • \u2018ee\u2019: Uses sklearn\u2019s EllipticEnvelope.

  • \n
  • \u2018lof\u2019: Uses sklearn\u2019s LocalOutlierFactor.

  • \n
\n
\n", - "default_val": "\u201ciforest\u201d" - }, - "fix_imbalance": { - "type": "bool", - "tooltip": "

When training dataset has unequal distribution of target class it can be balanced\nusing this parameter. When set to True, SMOTE (Synthetic Minority Over-sampling\nTechnique) is applied by default to create synthetic datapoints for minority class.

\n", - "default_val": "False" - }, - "fix_imbalance_method": { - "type": "string", - "tooltip": "

Estimator with which to perform class balancing. Choose from the name\nof an imblearn estimator, or a custom instance of such. Ignored when\nfix_imbalance=False.

\n", - "default_val": "\u201cSMOTE\u201d" - }, - "custom_pipeline": { - "type": "list of (str, transformer), dict or Pipeline", - "tooltip": "

Addidiotnal custom transformers. If passed, they are applied to the\npipeline last, after all the build-in transformers.

\n", - "default_val": "None" - }, - "custom_pipeline_position": { - "type": "int", - "tooltip": "

Position of the custom pipeline in the overal preprocessing pipeline.\nThe default value adds the custom pipeline last.

\n", - "default_val": "-1" - }, - "data_split_shuffle": { - "type": "bool", - "tooltip": "

When set to False, prevents shuffling of rows during \u2018train_test_split\u2019.

\n", - "default_val": "True" - }, - "data_split_stratify": { - "type": "bool", - "tooltip": "

Controls stratification during \u2018train_test_split\u2019. When set to True, will\nstratify by target column. To stratify on any other columns, pass a list of\ncolumn names. Ignored when data_split_shuffle is False.

\n", - "default_val": "True" - }, - "fold_strategy": { - "type": "string", - "tooltip": "

Choice of cross validation strategy. Possible values are:

\n
    \n
  • \u2018kfold\u2019

  • \n
  • \u2018stratifiedkfold\u2019

  • \n
  • \u2018groupkfold\u2019

  • \n
  • \u2018timeseries\u2019

  • \n
  • a custom CV generator object compatible with scikit-learn.

  • \n
\n

For groupkfold, column name must be passed in fold_groups parameter.\nExample: setup(fold_strategy=\"groupkfold\", fold_groups=\"COLUMN_NAME\")

\n", - "default_val": "stratifiedkfold" - }, - "fold": { - "type": "int", - "tooltip": "

Number of folds to be used in cross validation. Must be at least 2. This is\na global setting that can be over-written at function level by using fold\nparameter. Ignored when fold_strategy is a custom object.

\n", - "default_val": "10" - }, - "fold_shuffle": { - "type": "bool", - "tooltip": "

Controls the shuffle parameter of CV. Only applicable when fold_strategy\nis \u2018kfold\u2019 or \u2018stratifiedkfold\u2019. Ignored when fold_strategy is a custom\nobject.

\n", - "default_val": "False" - }, - "fold_groups": { - "type": "string", - "tooltip": "

Optional group labels when \u2018GroupKFold\u2019 is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in the training dataset. When string is passed, it is interpreted\nas the column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "n_jobs": { - "type": "int", - "tooltip": "

The number of jobs to run in parallel (for functions that supports parallel\nprocessing) -1 means using all processors. To run all functions on single\nprocessor set n_jobs to None.

\n", - "default_val": "-1" - }, - "use_gpu": { - "type": "list", - "tooltip": "

When set to True, it will use GPU for training with algorithms that support it,\nand fall back to CPU if they are unavailable. When set to \u2018force\u2019, it will only\nuse GPU-enabled algorithms and raise exceptions when they are unavailable. When\nFalse, all algorithms are trained using CPU only.

\n

GPU enabled algorithms:

\n
    \n
  • Extreme Gradient Boosting, requires no further installation

  • \n
  • CatBoost Classifier, requires no further installation

  • \n
\n

(GPU is only enabled when data > 50,000 rows)

\n
    \n
  • Light Gradient Boosting Machine, requires GPU installation

  • \n
\n

https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html

\n
    \n
  • Logistic Regression, Ridge Classifier, Random Forest, K Neighbors Classifier,

  • \n
\n

Support Vector Machine, requires cuML >= 0.15\nhttps://github.com/rapidsai/cuml

\n", - "default_val": "False", - "choices": { - "False": "tooltip False", - "True": "tooltip True", - "force": "tooltip force" - } - }, - "html": { - "type": "bool", - "tooltip": "

When set to False, prevents runtime display of monitor. This must be set to False\nwhen the environment does not support IPython. For example, command line terminal,\nDatabricks Notebook, Spyder and other similar IDEs.

\n", - "default_val": "True" - }, - "session_id": { - "type": "int", - "tooltip": "

Controls the randomness of experiment. It is equivalent to \u2018random_state\u2019 in\nscikit-learn. When None, a pseudo random number is generated. This can be used\nfor later reproducibility of the entire experiment.

\n", - "default_val": "None" - }, - "experiment_name": { - "type": "string", - "tooltip": "

Name of the experiment for logging. Ignored when log_experiment is False.

\n", - "default_val": "None" - }, - "experiment_custom_tags": { - "type": "dict", - "tooltip": "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", - "default_val": "None" - }, - "log_plots": { - "type": "bool", - "tooltip": "

When set to True, certain plots are logged automatically in the MLFlow server.\nTo change the type of plots to be logged, pass a list containing plot IDs. Refer\nto documentation of plot_model. Ignored when log_experiment is False.

\n", - "default_val": "False" - }, - "log_profile": { - "type": "bool", - "tooltip": "

When set to True, data profile is logged on the MLflow server as a html file.\nIgnored when log_experiment is False.

\n", - "default_val": "False" - }, - "log_data": { - "type": "bool", - "tooltip": "

When set to True, dataset is logged on the MLflow server as a csv file.\nIgnored when log_experiment is False.

\n", - "default_val": "False" - }, - "engine": { - "type": "Optional[Dict[str, str]] = None", - "tooltip": "

The execution engines to use for the models in the form of a dict\nof model_id: engine - e.g. for Logistic Regression (\u201clr\u201d, users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine={\u201clr\u201d: \u201csklearnex\u201d}

\n", - "default_val": "" - }, - "verbose": { - "type": "bool", - "tooltip": "

When set to False, Information grid is not printed.

\n", - "default_val": "True" - }, - "memory": { - "type": "str, bool or Memory", - "tooltip": "
\n
Used to cache the fitted transformers of the pipeline.

If False: No caching is performed.\nIf True: A default temp directory is used.\nIf str: Path to the caching directory.

\n
\n
\n", - "default_val": "rue" - }, - "profile": { - "type": "bool", - "tooltip": "

When set to True, an interactive EDA report is displayed.

\n", - "default_val": "False" - }, - "profile_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the ProfileReport method used\nto create the EDA report. Ignored if profile is False.

\n", - "default_val": "{} (empty dict)" - }, - "time-point": { - "type": "string", - "default_val": "", - "tooltip": "

Time point relative to where analysis is performed

" - }, - "split_experiment_by_institutions": { - "type": "bool", - "default_val": "False", - "tooltip": "

Set this to true for analysis by institutions

" - } - }, - "code": "", - "default": { - "files": { - "type": "data-input", - "tooltip": "

Specify path to csv file or to medomics folder

" - } + code: "" + }, + dataset: { + options: { + data_func: { + type: "data-function", + tooltip: + "

The function that generates data (the dataframe-like input). This\nis useful when the dataset is large, and you need parallel operations\nsuch as compare_models. It can avoid broadcasting a large dataset\nfrom driver to workers. Notice one and only one of data and\ndata_func must be set.

\n", + default_val: "" + }, + index: { + type: "bool-int-str", + tooltip: + "
\n
Handle indices in the data dataframe.
    \n
  • If False: Reset to RangeIndex.

  • \n
  • If True: Keep the provided index.

  • \n
  • If int: Position of the column to use as index.

  • \n
  • If str: Name of the column to use as index.

  • \n
  • If sequence: Array with shape=(n_samples,) to use as index.

  • \n
\n
\n
\n", + default_val: "True" + }, + train_size: { + type: "float", + tooltip: "

Proportion of the dataset to be used for training and validation. Should be\nbetween 0.0 and 1.0.

\n", + default_val: "0.7" + }, + test_data: { + type: "dataframe", + tooltip: "

If not None, test_data is used as a hold-out set and train_size parameter\nis ignored. The columns of data and test_data must match.

\n", + default_val: "None" + }, + ordinal_features: { + type: "dict", + tooltip: + "

Categorical features to be encoded ordinally. For example, a categorical\nfeature with \u2018low\u2019, \u2018medium\u2019, \u2018high\u2019 values where low < medium < high can\nbe passed as ordinal_features = {\u2018column_name\u2019 : [\u2018low\u2019, \u2018medium\u2019, \u2018high\u2019]}.

\n", + default_val: "None" + }, + numeric_features: { + type: "custom-list", + tooltip: + "

If the inferred data types are not correct, the numeric_features param can\nbe used to define the data types. It takes a list of strings with column\nnames that are numeric.

\n", + default_val: "None" + }, + categorical_features: { + type: "custom-list", + tooltip: + "

If the inferred data types are not correct, the categorical_features param\ncan be used to define the data types. It takes a list of strings with column\nnames that are categorical.

\n", + default_val: "None" + }, + date_features: { + type: "custom-list", + tooltip: + "

If the inferred data types are not correct, the date_features param can be\nused to overwrite the data types. It takes a list of strings with column\nnames that are DateTime.

\n", + default_val: "None" + }, + text_features: { + type: "custom-list", + tooltip: "

Column names that contain a text corpus. If None, no text features are\nselected.

\n", + default_val: "None" + }, + ignore_features: { + type: "custom-list", + tooltip: "

ignore_features param can be used to ignore features during preprocessing\nand model training. It takes a list of strings with column names that are\nto be ignored.

\n", + default_val: "None" + }, + keep_features: { + type: "custom-list", + tooltip: + "

keep_features param can be used to always keep specific features during\npreprocessing, i.e. these features are never dropped by any kind of\nfeature selection. It takes a list of strings with column names that are\nto be kept.

\n", + default_val: "None" + }, + preprocess: { + type: "bool", + tooltip: + "

When set to False, no transformations are applied except for train_test_split\nand custom transformations passed in custom_pipeline param. Data must be\nready for modeling (no missing values, no dates, categorical data encoding),\nwhen preprocess is set to False.

\n", + default_val: "True" + }, + create_date_columns: { + type: "custom-list", + tooltip: + "

Columns to create from the date features. Note that created features\nwith zero variance (e.g. the feature hour in a column that only contains\ndates) are ignored. Allowed values are datetime attributes from\npandas.Series.dt. The datetime format of the feature is inferred\nautomatically from the first non NaN value.

\n", + default_val: "[\u201cday\u201d, \u201cmonth\u201d, \u201cyear\u201d]" + }, + text_features_method: { + type: "string", + tooltip: + "

Method with which to embed the text features in the dataset. Choose\nbetween \u201cbow\u201d (Bag of Words - CountVectorizer) or \u201ctf-idf\u201d (TfidfVectorizer).\nBe aware that the sparse matrix output of the transformer is converted\ninternally to its full array. This can cause memory issues for large\ntext embeddings.

\n", + default_val: "\u201ctf-idf\u201d" + }, + max_encoding_ohe: { + type: "int", + tooltip: + "

Categorical columns with max_encoding_ohe or less unique values are\nencoded using OneHotEncoding. If more, the encoding_method estimator\nis used. Note that columns with exactly two classes are always encoded\nordinally. Set to below 0 to always use OneHotEncoding.

\n", + default_val: "25" + }, + encoding_method: { + type: "category-encoders estimator", + tooltip: + "

A category-encoders estimator to encode the categorical columns\nwith more than max_encoding_ohe unique values. If None,\ncategory_encoders.target_encoder.TargetEncoder is used.

\n", + default_val: "None" + }, + rare_to_value: { + type: "float", + tooltip: + "

Minimum fraction of category occurrences in a categorical column.\nIf a category is less frequent than rare_to_value * len(X), it is\nreplaced with the string in rare_value. Use this parameter to group\nrare categories before encoding the column. If None, ignores this step.

\n", + default_val: "None" + }, + rare_value: { + type: "string", + tooltip: "

Value with which to replace rare categories. Ignored when\nrare_to_value is None.

\n", + default_val: "\u201crare\u201d" + }, + low_variance_threshold: { + type: "float", + tooltip: + "

Remove features with a training-set variance lower than the provided\nthreshold. If 0, keep all features with non-zero variance, i.e. remove\nthe features that have the same value in all samples. If None, skip\nthis transformation step.

\n", + default_val: "None" + }, + group_features: { + type: "dict", + tooltip: + "

When the dataset contains features with related characteristics,\nadd new features with the following statistical properties of that\ngroup: min, max, mean, std, median and mode. The parameter takes a\ndict with the group name as key and a list of feature names\nbelonging to that group as value.

\n", + default_val: "None" + }, + drop_groups: { + type: "bool", + tooltip: "

Whether to drop the original features in the group. Ignored when\ngroup_features is None.

\n", + default_val: "False" + }, + bin_numeric_features: { + type: "custom-list", + tooltip: + "

To convert numeric features into categorical, bin_numeric_features parameter can\nbe used. It takes a list of strings with column names to be discretized. It does\nso by using \u2018sturges\u2019 rule to determine the number of clusters and then apply\nKMeans algorithm. Original values of the feature are then replaced by the\ncluster label.

\n", + default_val: "None" + }, + outliers_method: { + type: "string", + tooltip: + "

Method with which to remove outliers. Ignored when remove_outliers=False.\nPossible values are:

\n
\n
    \n
  • \u2018iforest\u2019: Uses sklearn\u2019s IsolationForest.

  • \n
  • \u2018ee\u2019: Uses sklearn\u2019s EllipticEnvelope.

  • \n
  • \u2018lof\u2019: Uses sklearn\u2019s LocalOutlierFactor.

  • \n
\n
\n", + default_val: "\u201ciforest\u201d" + }, + fix_imbalance: { + type: "bool", + tooltip: + "

When training dataset has unequal distribution of target class it can be balanced\nusing this parameter. When set to True, SMOTE (Synthetic Minority Over-sampling\nTechnique) is applied by default to create synthetic datapoints for minority class.

\n", + default_val: "False" + }, + fix_imbalance_method: { + type: "string", + tooltip: + "

Estimator with which to perform class balancing. Choose from the name\nof an imblearn estimator, or a custom instance of such. Ignored when\nfix_imbalance=False.

\n", + default_val: "\u201cSMOTE\u201d" + }, + custom_pipeline: { + type: "list of (str, transformer), dict or Pipeline", + tooltip: "

Additional custom transformers. If passed, they are applied to the\npipeline last, after all the built-in transformers.

\n", + default_val: "None" + }, + custom_pipeline_position: { + type: "int", + tooltip: "

Position of the custom pipeline in the overall preprocessing pipeline.\nThe default value adds the custom pipeline last.

\n", + default_val: "-1" + }, + data_split_shuffle: { + type: "bool", + tooltip: "

When set to False, prevents shuffling of rows during \u2018train_test_split\u2019.

\n", + default_val: "True" + }, + data_split_stratify: { + type: "bool", + tooltip: + "

Controls stratification during \u2018train_test_split\u2019. When set to True, will\nstratify by target column. To stratify on any other columns, pass a list of\ncolumn names. Ignored when data_split_shuffle is False.

\n", + default_val: "True" + }, + fold_strategy: { + type: "string", + tooltip: + '

Choice of cross validation strategy. Possible values are:

\n
    \n
  • \u2018kfold\u2019

  • \n
  • \u2018stratifiedkfold\u2019

  • \n
  • \u2018groupkfold\u2019

  • \n
  • \u2018timeseries\u2019

  • \n
  • a custom CV generator object compatible with scikit-learn.

  • \n
\n

For groupkfold, column name must be passed in fold_groups parameter.\nExample: setup(fold_strategy="groupkfold", fold_groups="COLUMN_NAME")

\n', + default_val: "stratifiedkfold" + }, + fold: { + type: "int", + tooltip: + "

Number of folds to be used in cross validation. Must be at least 2. This is\na global setting that can be over-written at function level by using fold\nparameter. Ignored when fold_strategy is a custom object.

\n", + default_val: "10" + }, + fold_shuffle: { + type: "bool", + tooltip: + "

Controls the shuffle parameter of CV. Only applicable when fold_strategy\nis \u2018kfold\u2019 or \u2018stratifiedkfold\u2019. Ignored when fold_strategy is a custom\nobject.

\n", + default_val: "False" + }, + fold_groups: { + type: "string", + tooltip: + "

Optional group labels when \u2018GroupKFold\u2019 is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in the training dataset. When string is passed, it is interpreted\nas the column name in the dataset containing group labels.

\n", + default_val: "None" + }, + n_jobs: { + type: "int", + tooltip: + "

The number of jobs to run in parallel (for functions that support parallel\nprocessing). -1 means using all processors. To run all functions on a single\nprocessor, set n_jobs to None.

\n", + default_val: "-1" + }, + use_gpu: { + type: "list", + tooltip: + "

When set to True, it will use GPU for training with algorithms that support it,\nand fall back to CPU if they are unavailable. When set to \u2018force\u2019, it will only\nuse GPU-enabled algorithms and raise exceptions when they are unavailable. When\nFalse, all algorithms are trained using CPU only.

\n

GPU enabled algorithms:

\n
    \n
  • Extreme Gradient Boosting, requires no further installation

  • \n
  • CatBoost Classifier, requires no further installation

  • \n
\n

(GPU is only enabled when data > 50,000 rows)

\n
    \n
  • Light Gradient Boosting Machine, requires GPU installation

  • \n
\n

https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html

\n
    \n
  • Logistic Regression, Ridge Classifier, Random Forest, K Neighbors Classifier,

  • \n
\n

Support Vector Machine, requires cuML >= 0.15\nhttps://github.com/rapidsai/cuml

\n", + default_val: "False", + choices: { + False: "tooltip False", + True: "tooltip True", + force: "tooltip force" } + }, + html: { + type: "bool", + tooltip: + "

When set to False, prevents runtime display of monitor. This must be set to False\nwhen the environment does not support IPython. For example, command line terminal,\nDatabricks Notebook, Spyder and other similar IDEs.

\n", + default_val: "True" + }, + session_id: { + type: "int", + tooltip: + "

Controls the randomness of experiment. It is equivalent to \u2018random_state\u2019 in\nscikit-learn. When None, a pseudo random number is generated. This can be used\nfor later reproducibility of the entire experiment.

\n", + default_val: "None" + }, + experiment_name: { + type: "string", + tooltip: "

Name of the experiment for logging. Ignored when log_experiment is False.

\n", + default_val: "None" + }, + experiment_custom_tags: { + type: "dict", + tooltip: "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", + default_val: "None" + }, + log_plots: { + type: "bool", + tooltip: + "

When set to True, certain plots are logged automatically in the MLFlow server.\nTo change the type of plots to be logged, pass a list containing plot IDs. Refer\nto documentation of plot_model. Ignored when log_experiment is False.

\n", + default_val: "False" + }, + log_profile: { + type: "bool", + tooltip: "

When set to True, data profile is logged on the MLflow server as a html file.\nIgnored when log_experiment is False.

\n", + default_val: "False" + }, + log_data: { + type: "bool", + tooltip: "

When set to True, dataset is logged on the MLflow server as a csv file.\nIgnored when log_experiment is False.

\n", + default_val: "False" + }, + engine: { + type: "Optional[Dict[str, str]] = None", + tooltip: + "

The execution engines to use for the models in the form of a dict\nof model_id: engine - e.g. for Logistic Regression (\u201clr\u201d), users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine={\u201clr\u201d: \u201csklearnex\u201d}

\n", + default_val: "" + }, + verbose: { + type: "bool", + tooltip: "

When set to False, Information grid is not printed.

\n", + default_val: "True" + }, + memory: { + type: "str, bool or Memory", + tooltip: + "
\n
Used to cache the fitted transformers of the pipeline.

If False: No caching is performed.\nIf True: A default temp directory is used.\nIf str: Path to the caching directory.

\n
\n
\n", + default_val: "True" + }, + profile: { + type: "bool", + tooltip: "

When set to True, an interactive EDA report is displayed.

\n", + default_val: "False" + }, + profile_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the ProfileReport method used\nto create the EDA report. Ignored if profile is False.

\n", + default_val: "{} (empty dict)" + }, + "time-point": { + type: "string", + default_val: "", + tooltip: "

Time point relative to where analysis is performed

" + }, + split_experiment_by_institutions: { + type: "bool", + default_val: "False", + tooltip: "

Set this to true for analysis by institutions

" + } }, - "optimize": { - "subNodes": [ - "tune_model", - "ensemble_model", - "blend_models", - "stack_models", - "calibrate_model" - ], - "options": {}, - "code": "" + code: "", + default: { + files: { + type: "data-input", + tooltip: "

Specify path to csv file or to medomics folder

" + } + } + }, + optimize: { + subNodes: ["tune_model", "ensemble_model", "blend_models", "stack_models", "calibrate_model"], + options: {}, + code: "" + }, + compare_models: { + options: { + include: { + type: "list-multiple", + tooltip: + "

To train and evaluate select models, list containing model ID or scikit-learn\ncompatible object can be passed in include param. To see a list of all models\navailable in the model library use the Model node.

\n", + default_val: "None", + choices: { + lr: "Logistic Regression", + knn: "K Neighbors Classifier", + nb: "Naive Bayes", + dt: "Decision Tree Classifier", + svm: "SVM - Linear Kernel", + rbfsvm: "SVM - Radial Kernel", + gpc: "Gaussian Process Classifier", + mlp: "MLP Classifier", + ridge: "Ridge Classifier", + rf: "Random Forest Classifier", + qda: "Quadratic Discriminant Analysis", + ada: "Ada Boost Classifier", + gbc: "Gradient Boosting Classifier", + lda: "Linear Discriminant Analysis", + et: "Extra Trees Classifier", + dummy: "Dummy Classifier", + xgboost: "Extreme Gradient Boosting", + lightgbm: "Light Gradient Boosting Machine", + catboost: "CatBoost Classifier" + } + }, + exclude: { + type: "list-multiple", + tooltip: + "

To omit certain models from training and evaluation, pass a list containing\nmodel id in the exclude parameter. To see a list of all models available\nin the model library use the Model node.

\n", + default_val: "None", + choices: { + lr: "Logistic Regression", + knn: "K Neighbors Classifier", + nb: "Naive Bayes", + dt: "Decision Tree Classifier", + svm: "SVM - Linear Kernel", + rbfsvm: "SVM - Radial Kernel", + gpc: "Gaussian Process Classifier", + mlp: "MLP Classifier", + ridge: "Ridge Classifier", + rf: "Random Forest Classifier", + qda: "Quadratic Discriminant Analysis", + ada: "Ada Boost Classifier", + gbc: "Gradient Boosting Classifier", + lda: "Linear Discriminant Analysis", + et: "Extra Trees Classifier", + dummy: "Dummy Classifier", + xgboost: "Extreme Gradient Boosting", + lightgbm: "Light Gradient Boosting Machine", + catboost: "CatBoost Classifier" + } + }, + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + cross_validation: { + type: "bool", + tooltip: "

When set to False, metrics are evaluated on holdout set. fold param\nis ignored when cross_validation is set to False.

\n", + default_val: "True" + }, + sort: { + type: "string", + tooltip: "

The sort order of the score grid. It also accepts custom metrics that are\nadded through the add_metric function.

\n", + default_val: "Accuracy" + }, + n_select: { + type: "int", + tooltip: "

Number of top_n models to return. For example, to select top 3 models use\nn_select = 3.

\n", + default_val: "1" + }, + budget_time: { + type: "float", + tooltip: "

If not None, will terminate execution of the function after budget_time\nminutes have passed and return results up to that point.

\n", + default_val: "None" + }, + turbo: { + type: "bool", + tooltip: "

When set to True, it excludes estimators with longer training times. To\nsee which algorithms are excluded use the models function.

\n", + default_val: "True" + }, + errors: { + type: "string", + tooltip: "

When set to \u2018ignore\u2019, will skip the model with exceptions and continue.\nIf \u2018raise\u2019, will break the function when exceptions are raised.

\n", + default_val: "ignore" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when \u2018GroupKFold\u2019 is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in the training dataset. When string is passed, it is interpreted\nas the column name in the dataset containing group labels.

\n", + default_val: "None" + }, + experiment_custom_tags: { + type: "dict", + tooltip: "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", + default_val: "None" + }, + probability_threshold: { + type: "float", + tooltip: + "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", + default_val: "None" + }, + engine: { + type: "Optional[Dict[str, str]] = None", + tooltip: + "

The execution engines to use for the models in the form of a dict\nof model_id: engine - e.g. for Logistic Regression (\u201clr\u201d), users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine={\u201clr\u201d: \u201csklearnex\u201d}

\n", + default_val: "" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + } }, - "compare_models": { - "options": { - "include": { - "type": "list-multiple", - "tooltip": "

To train and evaluate select models, list containing model ID or scikit-learn\ncompatible object can be passed in include param. To see a list of all models\navailable in the model library use the Model node.

\n", - "default_val": "None", - "choices": { - "lr": "Logistic Regression", - "knn": "K Neighbors Classifier", - "nb": "Naive Bayes", - "dt": "Decision Tree Classifier", - "svm": "SVM - Linear Kernel", - "rbfsvm": "SVM - Radial Kernel", - "gpc": "Gaussian Process Classifier", - "mlp": "MLP Classifier", - "ridge": "Ridge Classifier", - "rf": "Random Forest Classifier", - "qda": "Quadratic Discriminant Analysis", - "ada": "Ada Boost Classifier", - "gbc": "Gradient Boosting Classifier", - "lda": "Linear Discriminant Analysis", - "et": "Extra Trees Classifier", - "dummy": "Dummy Classifier", - "xgboost": "Extreme Gradient Boosting", - "lightgbm": "Light Gradient Boosting Machine", - "catboost": "CatBoost Classifier" - } - }, - "exclude": { - "type": "list-multiple", - "tooltip": "

To omit certain models from training and evaluation, pass a list containing\nmodel id in the exclude parameter. To see a list of all models available\nin the model library use the Model node.

\n", - "default_val": "None", - "choices": { - "lr": "Logistic Regression", - "knn": "K Neighbors Classifier", - "nb": "Naive Bayes", - "dt": "Decision Tree Classifier", - "svm": "SVM - Linear Kernel", - "rbfsvm": "SVM - Radial Kernel", - "gpc": "Gaussian Process Classifier", - "mlp": "MLP Classifier", - "ridge": "Ridge Classifier", - "rf": "Random Forest Classifier", - "qda": "Quadratic Discriminant Analysis", - "ada": "Ada Boost Classifier", - "gbc": "Gradient Boosting Classifier", - "lda": "Linear Discriminant Analysis", - "et": "Extra Trees Classifier", - "dummy": "Dummy Classifier", - "xgboost": "Extreme Gradient Boosting", - "lightgbm": "Light Gradient Boosting Machine", - "catboost": "CatBoost Classifier" - } - }, - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "cross_validation": { - "type": "bool", - "tooltip": "

When set to False, metrics are evaluated on holdout set. fold param\nis ignored when cross_validation is set to False.

\n", - "default_val": "True" - }, - "sort": { - "type": "string", - "tooltip": "

The sort order of the score grid. It also accepts custom metrics that are\nadded through the add_metric function.

\n", - "default_val": "Accuracy" - }, - "n_select": { - "type": "int", - "tooltip": "

Number of top_n models to return. For example, to select top 3 models use\nn_select = 3.

\n", - "default_val": "1" - }, - "budget_time": { - "type": "float", - "tooltip": "

If not None, will terminate execution of the function after budget_time\nminutes have passed and return results up to that point.

\n", - "default_val": "None" - }, - "turbo": { - "type": "bool", - "tooltip": "

When set to True, it excludes estimators with longer training times. To\nsee which algorithms are excluded use the models function.

\n", - "default_val": "True" - }, - "errors": { - "type": "string", - "tooltip": "

When set to \u2018ignore\u2019, will skip the model with exceptions and continue.\nIf \u2018raise\u2019, will break the function when exceptions are raised.

\n", - "default_val": "ignore" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when \u2018GroupKFold\u2019 is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in the training dataset. When string is passed, it is interpreted\nas the column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "experiment_custom_tags": { - "type": "dict", - "tooltip": "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", - "default_val": "None" - }, - "probability_threshold": { - "type": "float", - "tooltip": "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", - "default_val": "None" - }, - "engine": { - "type": "Optional[Dict[str, str]] = None", - "tooltip": "

The execution engines to use for the models in the form of a dict\nof model_id: engine - e.g. for Logistic Regression (\u201clr\u201d, users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine={\u201clr\u201d: \u201csklearnex\u201d}

\n", - "default_val": "" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - } - }, - "code": " " + code: " " + }, + create_model: { + options: { + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + cross_validation: { + type: "bool", + tooltip: "

When set to False, metrics are evaluated on holdout set. fold param\nis ignored when cross_validation is set to False.

\n", + default_val: "True" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + probability_threshold: { + type: "float", + tooltip: + "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", + default_val: "None" + }, + experiment_custom_tags: { + type: "dict", + tooltip: "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", + default_val: "None" + }, + engine: { + type: "Optional[str] = None", + tooltip: + "

The execution engine to use for the model, e.g. for Logistic Regression (\u201clr\u201d), users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine=\u201csklearnex\u201d.

\n", + default_val: "" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } }, - "create_model": { - "options": { - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "cross_validation": { - "type": "bool", - "tooltip": "

When set to False, metrics are evaluated on holdout set. fold param\nis ignored when cross_validation is set to False.

\n", - "default_val": "True" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "probability_threshold": { - "type": "float", - "tooltip": "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", - "default_val": "None" - }, - "experiment_custom_tags": { - "type": "dict", - "tooltip": "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", - "default_val": "None" - }, - "engine": { - "type": "Optional[str] = None", - "tooltip": "

The execution engine to use for the model, e.g. for Logistic Regression (\u201clr\u201d), users can\nswitch between \u201csklearn\u201d and \u201csklearnex\u201d by specifying\nengine=\u201dsklearnex\u201d.

\n", - "default_val": "" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } + code: "", + default: {} + }, + analyze: { + plot_model: { + options: { + plot: { + type: "string", + tooltip: + "

List of available plots (ID - Name):

\n
    \n
  • \u2018pipeline\u2019 - Schematic drawing of the preprocessing pipeline

  • \n
  • \u2018auc\u2019 - Area Under the Curve

  • \n
  • \u2018threshold\u2019 - Discrimination Threshold

  • \n
  • \u2018pr\u2019 - Precision Recall Curve

  • \n
  • \u2018confusion_matrix\u2019 - Confusion Matrix

  • \n
  • \u2018error\u2019 - Class Prediction Error

  • \n
  • \u2018class_report\u2019 - Classification Report

  • \n
  • \u2018boundary\u2019 - Decision Boundary

  • \n
  • \u2018rfe\u2019 - Recursive Feature Selection

  • \n
  • \u2018learning\u2019 - Learning Curve

  • \n
  • \u2018manifold\u2019 - Manifold Learning

  • \n
  • \u2018calibration\u2019 - Calibration Curve

  • \n
  • \u2018vc\u2019 - Validation Curve

  • \n
  • \u2018dimension\u2019 - Dimension Learning

  • \n
  • \u2018feature\u2019 - Feature Importance

  • \n
  • \u2018feature_all\u2019 - Feature Importance (All)

  • \n
  • \u2018parameter\u2019 - Model Hyperparameter

  • \n
  • \u2018lift\u2019 - Lift Curve

  • \n
  • \u2018gain\u2019 - Gain Chart

  • \n
  • \u2018tree\u2019 - Decision Tree

  • \n
  • \u2018ks\u2019 - KS Statistic Plot

  • \n
\n", + default_val: "auc" }, - "code": "", - "default": {} - }, - "analyze": { - "plot_model": { - "options": { - "plot": { - "type": "string", - "tooltip": "

List of available plots (ID - Name):

\n
    \n
  • \u2018pipeline\u2019 - Schematic drawing of the preprocessing pipeline

  • \n
  • \u2018auc\u2019 - Area Under the Curve

  • \n
  • \u2018threshold\u2019 - Discrimination Threshold

  • \n
  • \u2018pr\u2019 - Precision Recall Curve

  • \n
  • \u2018confusion_matrix\u2019 - Confusion Matrix

  • \n
  • \u2018error\u2019 - Class Prediction Error

  • \n
  • \u2018class_report\u2019 - Classification Report

  • \n
  • \u2018boundary\u2019 - Decision Boundary

  • \n
  • \u2018rfe\u2019 - Recursive Feature Selection

  • \n
  • \u2018learning\u2019 - Learning Curve

  • \n
  • \u2018manifold\u2019 - Manifold Learning

  • \n
  • \u2018calibration\u2019 - Calibration Curve

  • \n
  • \u2018vc\u2019 - Validation Curve

  • \n
  • \u2018dimension\u2019 - Dimension Learning

  • \n
  • \u2018feature\u2019 - Feature Importance

  • \n
  • \u2018feature_all\u2019 - Feature Importance (All)

  • \n
  • \u2018parameter\u2019 - Model Hyperparameter

  • \n
  • \u2018lift\u2019 - Lift Curve

  • \n
  • \u2018gain\u2019 - Gain Chart

  • \n
  • \u2018tree\u2019 - Decision Tree

  • \n
  • \u2018ks\u2019 - KS Statistic Plot

  • \n
\n", - "default_val": "auc" - }, - "scale": { - "type": "float", - "tooltip": "

The resolution scale of the figure.

\n", - "default_val": "1" - }, - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "plot_kwargs": { - "type": "dict", - "tooltip": "
\n
Dictionary of arguments passed to the visualizer class.
    \n
  • pipeline: fontsize -> int

  • \n
\n
\n
\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

When set to False, backend's progress bar is not displayed.

\n", - "default_val": "True" - }, - "display_format": { - "type": "string", - "tooltip": "

To display plots in Streamlit (https://www.streamlit.io/), set this to \u2018streamlit\u2019.\nCurrently, not all plots are supported.

\n", - "default_val": "None" - } - }, - "code": "plot_model()", - "default": {} + scale: { + type: "float", + tooltip: "

The resolution scale of the figure.

\n", + default_val: "1" + }, + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + plot_kwargs: { + type: "dict", + tooltip: "
\n
Dictionary of arguments passed to the visualizer class.
    \n
  • pipeline: fontsize -> int

  • \n
\n
\n
\n", + default_val: "{} (empty dict)" }, - "interpret_model": { - "options": { - "plot": { - "type": "string", - "tooltip": "

Abbreviation of type of plot. The current list of plots supported\nare (Plot - Name):\n* \u2018summary\u2019 - Summary Plot using SHAP\n* \u2018correlation\u2019 - Dependence Plot using SHAP\n* \u2018reason\u2019 - Force Plot using SHAP\n* \u2018pdp\u2019 - Partial Dependence Plot\n* \u2018msa\u2019 - Morris Sensitivity Analysis\n* \u2018pfi\u2019 - Permutation Feature Importance

\n", - "default_val": "summary" - }, - "feature": { - "type": "string", - "tooltip": "

This parameter is only needed when plot = \u2018correlation\u2019 or \u2018pdp\u2019.\nBy default feature is set to None which means the first column of the\ndataset will be used as a variable. A feature parameter must be passed\nto change this.

\n", - "default_val": "None" - }, - "observation": { - "type": "int", - "tooltip": "

This parameter only comes into effect when plot is set to \u2018reason\u2019. If no\nobservation number is provided, it will return an analysis of all observations\nwith the option to select the feature on x and y axes through drop down\ninteractivity. For analysis at the sample level, an observation parameter must\nbe passed with the index value of the observation in test / hold-out set.

\n", - "default_val": "None" - }, - "use_train_data": { - "type": "bool", - "tooltip": "

When set to true, train data will be used for plots, instead\nof test data.

\n", - "default_val": "False" - }, - "X_new_sample": { - "type": "dataframe", - "tooltip": "

Row from an out-of-sample dataframe (neither train nor test data) to be plotted.\nThe sample must have the same columns as the raw input train data, and it is transformed\nby the preprocessing pipeline automatically before plotting.

\n", - "default_val": "None" - }, - "y_new_sample": { - "type": "dataframe", - "tooltip": "

Row from an out-of-sample dataframe (neither train nor test data) to be plotted.\nThe sample must have the same columns as the raw input label data, and it is transformed\nby the preprocessing pipeline automatically before plotting.

\n", - "default_val": "None" - } - }, - "code": "interpret_model()" + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" }, - "dashboard": { - "options": { - "display_format": { - "type": "string", - "tooltip": "

Render mode for the dashboard. The default is set to dash which will\nrender a dashboard in browser. There are four possible options:

\n
    \n
  • \u2018dash\u2019 - displays the dashboard in browser

  • \n
  • \u2018inline\u2019 - displays the dashboard in the jupyter notebook cell.

  • \n
  • \u2018jupyterlab\u2019 - displays the dashboard in jupyterlab pane.

  • \n
  • \u2018external\u2019 - displays the dashboard in a separate tab. (use in Colab)

  • \n
\n", - "default_val": "dash" - }, - "dashboard_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the ExplainerDashboard class.

\n", - "default_val": "{} (empty dict)" - }, - "run_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the run method of ExplainerDashboard.

\n", - "default_val": "{} (empty dict)" - } - }, - "code": "dashboard()", - "default": {} + verbose: { + type: "bool", + tooltip: "

When set to False, backend's progress bar is not displayed.

\n", + default_val: "True" + }, + display_format: { + type: "string", + tooltip: "

To display plots in Streamlit (https://www.streamlit.io/), set this to \u2018streamlit\u2019.\nCurrently, not all plots are supported.

\n", + default_val: "None" } + }, + code: "plot_model()", + default: {} }, - "finalize": { - "options": { - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "model_only": { - "type": "bool", - "tooltip": "

Whether to return the complete fitted pipeline or only the fitted model.

\n", - "default_val": "False" - }, - "experiment_custom_tags": { - "type": "dict", - "tooltip": "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", - "default_val": "None" - } + interpret_model: { + options: { + plot: { + type: "string", + tooltip: + "

Abbreviation of type of plot. The current list of plots supported\nare (Plot - Name):\n* \u2018summary\u2019 - Summary Plot using SHAP\n* \u2018correlation\u2019 - Dependence Plot using SHAP\n* \u2018reason\u2019 - Force Plot using SHAP\n* \u2018pdp\u2019 - Partial Dependence Plot\n* \u2018msa\u2019 - Morris Sensitivity Analysis\n* \u2018pfi\u2019 - Permutation Feature Importance

\n", + default_val: "summary" }, - "code": "", - "default": {} - }, - "save_model": { - "options": { - "model_name": { - "type": "string", - "tooltip": "

Name of the model.

\n", - "default_val": "model" - }, - "model_only": { - "type": "bool", - "tooltip": "

When set to True, only trained model object is saved instead of the\nentire pipeline.

\n", - "default_val": "False" - }, - "verbose": { - "type": "bool", - "tooltip": "

Success message is not printed when verbose is set to False.

\n", - "default_val": "True" - } + feature: { + type: "string", + tooltip: + "

This parameter is only needed when plot = \u2018correlation\u2019 or \u2018pdp\u2019.\nBy default feature is set to None which means the first column of the\ndataset will be used as a variable. A feature parameter must be passed\nto change this.

\n", + default_val: "None" }, - "code": "", - "default": {} - }, - "load_model": { - "options": { - "platform": { - "type": "string", - "tooltip": "

Name of the cloud platform. Currently supported platforms:\n\u2018aws\u2019, \u2018gcp\u2019 and \u2018azure\u2019.

\n", - "default_val": "None" - }, - "authentication": { - "type": "dict", - "tooltip": "

dictionary of applicable authentication tokens.

\n

when platform = \u2018aws\u2019:\n{\u2018bucket\u2019 : \u2018Name of Bucket on S3\u2019, \u2018path\u2019: (optional) folder name under the bucket}

\n

when platform = \u2018gcp\u2019:\n{\u2018project\u2019: \u2018gcp-project-name\u2019, \u2018bucket\u2019 : \u2018gcp-bucket-name\u2019}

\n

when platform = \u2018azure\u2019:\n{\u2018container\u2019: \u2018azure-container-name\u2019}

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

Success message is not printed when verbose is set to False.

\n", - "default_val": "True" - } + observation: { + type: "int", + tooltip: + "

This parameter only comes into effect when plot is set to \u2018reason\u2019. If no\nobservation number is provided, it will return an analysis of all observations\nwith the option to select the feature on x and y axes through drop down\ninteractivity. For analysis at the sample level, an observation parameter must\nbe passed with the index value of the observation in test / hold-out set.

\n", + default_val: "None" }, - "code": "", - "default": { - "model_to_load": { - "type": "models-input", - "tooltip": "

Choose a model from the MODELS folder

" - } + use_train_data: { + type: "bool", + tooltip: "

When set to true, train data will be used for plots, instead\nof test data.

\n", + default_val: "False" + }, + X_new_sample: { + type: "dataframe", + tooltip: + "

Row from an out-of-sample dataframe (neither train nor test data) to be plotted.\nThe sample must have the same columns as the raw input train data, and it is transformed\nby the preprocessing pipeline automatically before plotting.

\n", + default_val: "None" + }, + y_new_sample: { + type: "dataframe", + tooltip: + "

Row from an out-of-sample dataframe (neither train nor test data) to be plotted.\nThe sample must have the same columns as the raw input label data, and it is transformed\nby the preprocessing pipeline automatically before plotting.

\n", + default_val: "None" } + }, + code: "interpret_model()" }, - "tune_model": { - "options": { - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "n_iter": { - "type": "int", - "tooltip": "

Number of iterations in the grid search. Increasing \u2018n_iter\u2019 may improve\nmodel performance but also increases the training time.

\n", - "default_val": "10" - }, - "custom_grid": { - "type": "dict", - "tooltip": "

To define custom search space for hyperparameters, pass a dictionary with\nparameter name and values to be iterated. Custom grids must be in a format\nsupported by the defined search_library.

\n", - "default_val": "None" - }, - "optimize": { - "type": "string", - "tooltip": "

Metric name to be evaluated for hyperparameter tuning. It also accepts custom\nmetrics that are added through the add_metric function.

\n", - "default_val": "Accuracy" - }, - "custom_scorer": { - "type": "object", - "tooltip": "

custom scoring strategy can be passed to tune hyperparameters of the model.\nIt must be created using sklearn.make_scorer. It is equivalent of adding\ncustom metric using the add_metric function and passing the name of the\ncustom metric in the optimize parameter.\nWill be deprecated in future.

\n", - "default_val": "None" - }, - "search_library": { - "type": "string", - "tooltip": "

The search library used for tuning hyperparameters. Possible values:

\n
    \n
  • \n
    \u2018scikit-learn\u2019 - default, requires no further installation

    https://github.com/scikit-learn/scikit-learn

    \n
    \n
    \n
  • \n
  • \n
    \u2018scikit-optimize\u2019 - pip install scikit-optimize

    https://scikit-optimize.github.io/stable/

    \n
    \n
    \n
  • \n
  • \n
    \u2018tune-sklearn\u2019 - pip install tune-sklearn ray[tune]

    https://github.com/ray-project/tune-sklearn

    \n
    \n
    \n
  • \n
  • \n
    \u2018optuna\u2019 - pip install optuna

    https://optuna.org/

    \n
    \n
    \n
  • \n
\n", - "default_val": "scikit-learn" - }, - "search_algorithm": { - "type": "string", - "tooltip": "

The search algorithm depends on the search_library parameter.\nSome search algorithms require additional libraries to be installed.\nIf None, will use search library-specific default algorithm.

\n
    \n
  • \n
    \u2018scikit-learn\u2019 possible values:
      \n
    • \u2018random\u2019 : random grid search (default)

    • \n
    • \u2018grid\u2019 : grid search

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018scikit-optimize\u2019 possible values:
      \n
    • \u2018bayesian\u2019 : Bayesian search (default)

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018tune-sklearn\u2019 possible values:
      \n
    • \u2018random\u2019 : random grid search (default)

    • \n
    • \u2018grid\u2019 : grid search

    • \n
    • \u2018bayesian\u2019 : pip install scikit-optimize

    • \n
    • \u2018hyperopt\u2019 : pip install hyperopt

    • \n
    • \u2018optuna\u2019 : pip install optuna

    • \n
    • \u2018bohb\u2019 : pip install hpbandster ConfigSpace

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018optuna\u2019 possible values:
      \n
    • \u2018random\u2019 : randomized search

    • \n
    • \u2018tpe\u2019 : Tree-structured Parzen Estimator search (default)

    • \n
    \n
    \n
    \n
  • \n
\n", - "default_val": "None" - }, - "early_stopping": { - "type": "string", - "tooltip": "

Use early stopping to stop fitting to a hyperparameter configuration\nif it performs poorly. Ignored when search_library is scikit-learn,\nor if the estimator does not have \u2018partial_fit\u2019 attribute. If False or\nNone, early stopping will not be used. Can be either an object accepted\nby the search library or one of the following:

\n
    \n
  • \u2018asha\u2019 for Asynchronous Successive Halving Algorithm

  • \n
  • \u2018hyperband\u2019 for Hyperband

  • \n
  • \u2018median\u2019 for Median Stopping Rule

  • \n
  • If False or None, early stopping will not be used.

  • \n
\n", - "default_val": "False" - }, - "early_stopping_max_iters": { - "type": "int", - "tooltip": "

Maximum number of epochs to run for each sampled configuration.\nIgnored if early_stopping is False or None.

\n", - "default_val": "10" - }, - "choose_better": { - "type": "bool", - "tooltip": "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", - "default_val": "True" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the tuner.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "return_tuner": { - "type": "bool", - "tooltip": "

When set to True, will return a tuple of (model, tuner_object).

\n", - "default_val": "False" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "tuner_verbose": { - "type": "int", - "tooltip": "

If True or above 0, will print messages from the tuner. Higher values\nprint more messages. Ignored when verbose param is False.

\n", - "default_val": 0 - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } + dashboard: { + options: { + display_format: { + type: "string", + tooltip: + "

Render mode for the dashboard. The default is set to dash which will\nrender a dashboard in browser. There are four possible options:

\n
    \n
  • \u2018dash\u2019 - displays the dashboard in browser

  • \n
  • \u2018inline\u2019 - displays the dashboard in the jupyter notebook cell.

  • \n
  • \u2018jupyterlab\u2019 - displays the dashboard in jupyterlab pane.

  • \n
  • \u2018external\u2019 - displays the dashboard in a separate tab. (use in Colab)

  • \n
\n", + default_val: "dash" }, - "ml_types": "classification regression survival_analysis", - "code": "tune_model()", - "default": {} - }, - "ensemble_model": { - "options": { - "method": { - "type": "string", - "tooltip": "

Method for ensembling base estimator. It can be \u2018Bagging\u2019 or \u2018Boosting\u2019.

\n", - "default_val": "Bagging" - }, - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "n_estimators": { - "type": "int", - "tooltip": "

The number of base estimators in the ensemble. In case of perfect fit, the\nlearning procedure is stopped early.

\n", - "default_val": "10" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "choose_better": { - "type": "bool", - "tooltip": "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", - "default_val": "False" - }, - "optimize": { - "type": "string", - "tooltip": "

Metric to compare for model selection when choose_better is True.

\n", - "default_val": "Accuracy" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "probability_threshold": { - "type": "float", - "tooltip": "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } + dashboard_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the ExplainerDashboard class.

\n", + default_val: "{} (empty dict)" }, - "ml_types": "classification regression", - "code": "ensemble_model()", - "default": {} + run_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the run method of ExplainerDashboard.

\n", + default_val: "{} (empty dict)" + } + }, + code: "dashboard()", + default: {} + } + }, + finalize: { + options: { + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + model_only: { + type: "bool", + tooltip: "

Whether to return the complete fitted pipeline or only the fitted model.

\n", + default_val: "False" + }, + experiment_custom_tags: { + type: "dict", + tooltip: "

Dictionary of tag_name: String -> value: (String, but will be string-ified\nif not) passed to the mlflow.set_tags to add new custom tags for the experiment.

\n", + default_val: "None" + } }, - "blend_models": { - "options": { - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "choose_better": { - "type": "bool", - "tooltip": "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", - "default_val": "False" - }, - "optimize": { - "type": "string", - "tooltip": "

Metric to compare for model selection when choose_better is True.

\n", - "default_val": "Accuracy" - }, - "method": { - "type": "string", - "tooltip": "

\u2018hard\u2019 uses predicted class labels for majority rule voting. \u2018soft\u2019, predicts\nthe class label based on the argmax of the sums of the predicted probabilities,\nwhich is recommended for an ensemble of well-calibrated classifiers. Default\nvalue, \u2018auto\u2019, will try to use \u2018soft\u2019 and fall back to \u2018hard\u2019 if the former is\nnot supported.

\n", - "default_val": "auto" - }, - "weights": { - "type": "custom-list", - "tooltip": "

Sequence of weights (float or int) to weight the occurrences of predicted class\nlabels (hard voting) or class probabilities before averaging (soft voting). Uses\nuniform weights when None.

\n", - "default_val": "None" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "probability_threshold": { - "type": "float", - "tooltip": "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } - }, - "ml_types": "classification regression", - "code": "blend_models()", - "default": {} + code: "", + default: {} + }, + save_model: { + options: { + model_name: { + type: "string", + tooltip: "

Name of the model.

\n", + default_val: "model" + }, + model_only: { + type: "bool", + tooltip: "

When set to True, only trained model object is saved instead of the\nentire pipeline.

\n", + default_val: "False" + }, + verbose: { + type: "bool", + tooltip: "

Success message is not printed when verbose is set to False.

\n", + default_val: "True" + } }, - "stack_models": { - "options": { - "meta_model": { - "type": "list", - "tooltip": "

When None, Logistic Regression is trained as a meta model.

\n", - "default_val": "None", - "choices": { - "lr": "Logistic Regression", - "knn": "K Neighbors Classifier", - "nb": "Naive Bayes", - "dt": "Decision Tree Classifier", - "svm": "SVM - Linear Kernel", - "rbfsvm": "SVM - Radial Kernel", - "gpc": "Gaussian Process Classifier", - "mlp": "MLP Classifier", - "ridge": "Ridge Classifier", - "rf": "Random Forest Classifier", - "qda": "Quadratic Discriminant Analysis", - "ada": "Ada Boost Classifier", - "gbc": "Gradient Boosting Classifier", - "lda": "Linear Discriminant Analysis", - "et": "Extra Trees Classifier", - "xgboost": "Extreme Gradient Boosting", - "lightgbm": "Light Gradient Boosting Machine", - "catboost": "CatBoost Classifier" - } - }, - "meta_model_fold": { - "type": "int", - "tooltip": "

Controls internal cross-validation. Can be an integer or a scikit-learn\nCV generator. If set to an integer, will use (Stratifed)KFold CV with\nthat many folds. See scikit-learn documentation on Stacking for\nmore details.

\n", - "default_val": "5" - }, - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "method": { - "type": "string", - "tooltip": "

When set to \u2018auto\u2019, it will invoke, for each estimator, \u2018predict_proba\u2019,\n\u2018decision_function\u2019 or \u2018predict\u2019 in that order. Other, manually pass one\nof the value from \u2018predict_proba\u2019, \u2018decision_function\u2019 or \u2018predict\u2019.

\n", - "default_val": "auto" - }, - "restack": { - "type": "bool", - "tooltip": "

When set to False, only the predictions of estimators will be used as\ntraining data for the meta_model.

\n", - "default_val": "False" - }, - "choose_better": { - "type": "bool", - "tooltip": "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", - "default_val": "False" - }, - "optimize": { - "type": "string", - "tooltip": "

Metric to compare for model selection when choose_better is True.

\n", - "default_val": "Accuracy" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "probability_threshold": { - "type": "float", - "tooltip": "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } - }, - "ml_types": "classification regression", - "code": "stack_models()", - "default": {} + code: "", + default: {} + }, + load_model: { + options: { + platform: { + type: "string", + tooltip: "

Name of the cloud platform. Currently supported platforms:\n\u2018aws\u2019, \u2018gcp\u2019 and \u2018azure\u2019.

\n", + default_val: "None" + }, + authentication: { + type: "dict", + tooltip: + "

dictionary of applicable authentication tokens.

\n

when platform = \u2018aws\u2019:\n{\u2018bucket\u2019 : \u2018Name of Bucket on S3\u2019, \u2018path\u2019: (optional) folder name under the bucket}

\n

when platform = \u2018gcp\u2019:\n{\u2018project\u2019: \u2018gcp-project-name\u2019, \u2018bucket\u2019 : \u2018gcp-bucket-name\u2019}

\n

when platform = \u2018azure\u2019:\n{\u2018container\u2019: \u2018azure-container-name\u2019}

\n", + default_val: "None" + }, + verbose: { + type: "bool", + tooltip: "

Success message is not printed when verbose is set to False.

\n", + default_val: "True" + } }, - "calibrate_model": { - "options": { - "method": { - "type": "string", - "tooltip": "

The method to use for calibration. Can be \u2018sigmoid\u2019 which corresponds to\nPlatt\u2019s method or \u2018isotonic\u2019 which is a non-parametric approach.

\n", - "default_val": "sigmoid" - }, - "calibrate_fold": { - "type": "int", - "tooltip": "

Controls internal cross-validation. Can be an integer or a scikit-learn\nCV generator. If set to an integer, will use (Stratifed)KFold CV with\nthat many folds. See scikit-learn documentation on Stacking for\nmore details.

\n", - "default_val": "5" - }, - "fold": { - "type": "int", - "tooltip": "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", - "default_val": "None" - }, - "round": { - "type": "int", - "tooltip": "

Number of decimal places the metrics in the score grid will be rounded to.

\n", - "default_val": "4" - }, - "fit_kwargs": { - "type": "dict", - "tooltip": "

Dictionary of arguments passed to the fit method of the model.

\n", - "default_val": "{} (empty dict)" - }, - "groups": { - "type": "string", - "tooltip": "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", - "default_val": "None" - }, - "verbose": { - "type": "bool", - "tooltip": "

Score grid is not printed when verbose is set to False.

\n", - "default_val": "True" - }, - "return_train_score": { - "type": "bool", - "tooltip": "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", - "default_val": "False" - } - }, - "ml_types": "classification", - "code": "calibrate_model()", - "default": {} + code: "", + default: { + model_to_load: { + type: "models-input", + tooltip: "

Choose a model from the MODELS folder

" + } } -}; - export default classificationSettings; \ No newline at end of file + }, + tune_model: { + options: { + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + n_iter: { + type: "int", + tooltip: "

Number of iterations in the grid search. Increasing \u2018n_iter\u2019 may improve\nmodel performance but also increases the training time.

\n", + default_val: "10" + }, + custom_grid: { + type: "dict", + tooltip: + "

To define custom search space for hyperparameters, pass a dictionary with\nparameter name and values to be iterated. Custom grids must be in a format\nsupported by the defined search_library.

\n", + default_val: "None" + }, + optimize: { + type: "string", + tooltip: "

Metric name to be evaluated for hyperparameter tuning. It also accepts custom\nmetrics that are added through the add_metric function.

\n", + default_val: "Accuracy" + }, + custom_scorer: { + type: "object", + tooltip: + "

Custom scoring strategy can be passed to tune hyperparameters of the model.\nIt must be created using sklearn.metrics.make_scorer. It is equivalent to adding\na custom metric using the add_metric function and passing the name of the\ncustom metric in the optimize parameter.\nWill be deprecated in the future.

\n", + default_val: "None" + }, + search_library: { + type: "string", + tooltip: + "

The search library used for tuning hyperparameters. Possible values:

\n
    \n
  • \n
    \u2018scikit-learn\u2019 - default, requires no further installation

    https://github.com/scikit-learn/scikit-learn

    \n
    \n
    \n
  • \n
  • \n
    \u2018scikit-optimize\u2019 - pip install scikit-optimize

    https://scikit-optimize.github.io/stable/

    \n
    \n
    \n
  • \n
  • \n
    \u2018tune-sklearn\u2019 - pip install tune-sklearn ray[tune]

    https://github.com/ray-project/tune-sklearn

    \n
    \n
    \n
  • \n
  • \n
    \u2018optuna\u2019 - pip install optuna

    https://optuna.org/

    \n
    \n
    \n
  • \n
\n", + default_val: "scikit-learn" + }, + search_algorithm: { + type: "string", + tooltip: + "

The search algorithm depends on the search_library parameter.\nSome search algorithms require additional libraries to be installed.\nIf None, will use search library-specific default algorithm.

\n
    \n
  • \n
    \u2018scikit-learn\u2019 possible values:
      \n
    • \u2018random\u2019 : random grid search (default)

    • \n
    • \u2018grid\u2019 : grid search

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018scikit-optimize\u2019 possible values:
      \n
    • \u2018bayesian\u2019 : Bayesian search (default)

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018tune-sklearn\u2019 possible values:
      \n
    • \u2018random\u2019 : random grid search (default)

    • \n
    • \u2018grid\u2019 : grid search

    • \n
    • \u2018bayesian\u2019 : pip install scikit-optimize

    • \n
    • \u2018hyperopt\u2019 : pip install hyperopt

    • \n
    • \u2018optuna\u2019 : pip install optuna

    • \n
    • \u2018bohb\u2019 : pip install hpbandster ConfigSpace

    • \n
    \n
    \n
    \n
  • \n
  • \n
    \u2018optuna\u2019 possible values:
      \n
    • \u2018random\u2019 : randomized search

    • \n
    • \u2018tpe\u2019 : Tree-structured Parzen Estimator search (default)

    • \n
    \n
    \n
    \n
  • \n
\n", + default_val: "None" + }, + early_stopping: { + type: "string", + tooltip: + "

Use early stopping to stop fitting to a hyperparameter configuration\nif it performs poorly. Ignored when search_library is scikit-learn,\nor if the estimator does not have \u2018partial_fit\u2019 attribute. If False or\nNone, early stopping will not be used. Can be either an object accepted\nby the search library or one of the following:

\n
    \n
  • \u2018asha\u2019 for Asynchronous Successive Halving Algorithm

  • \n
  • \u2018hyperband\u2019 for Hyperband

  • \n
  • \u2018median\u2019 for Median Stopping Rule

  • \n
  • If False or None, early stopping will not be used.

  • \n
\n", + default_val: "False" + }, + early_stopping_max_iters: { + type: "int", + tooltip: "

Maximum number of epochs to run for each sampled configuration.\nIgnored if early_stopping is False or None.

\n", + default_val: "10" + }, + choose_better: { + type: "bool", + tooltip: "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", + default_val: "True" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the tuner.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + return_tuner: { + type: "bool", + tooltip: "

When set to True, will return a tuple of (model, tuner_object).

\n", + default_val: "False" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + tuner_verbose: { + type: "int", + tooltip: "

If True or above 0, will print messages from the tuner. Higher values\nprint more messages. Ignored when verbose param is False.

\n", + default_val: 0 + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } + }, + ml_types: "classification regression survival_analysis", + code: "tune_model()", + default: {} + }, + ensemble_model: { + options: { + method: { + type: "string", + tooltip: "

Method for ensembling base estimator. It can be \u2018Bagging\u2019 or \u2018Boosting\u2019.

\n", + default_val: "Bagging" + }, + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + n_estimators: { + type: "int", + tooltip: "

The number of base estimators in the ensemble. In case of perfect fit, the\nlearning procedure is stopped early.

\n", + default_val: "10" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + choose_better: { + type: "bool", + tooltip: "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", + default_val: "False" + }, + optimize: { + type: "string", + tooltip: "

Metric to compare for model selection when choose_better is True.

\n", + default_val: "Accuracy" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + probability_threshold: { + type: "float", + tooltip: + "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", + default_val: "None" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } + }, + ml_types: "classification regression", + code: "ensemble_model()", + default: {} + }, + blend_models: { + options: { + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + choose_better: { + type: "bool", + tooltip: "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", + default_val: "False" + }, + optimize: { + type: "string", + tooltip: "

Metric to compare for model selection when choose_better is True.

\n", + default_val: "Accuracy" + }, + method: { + type: "string", + tooltip: + "

\u2018hard\u2019 uses predicted class labels for majority rule voting. \u2018soft\u2019, predicts\nthe class label based on the argmax of the sums of the predicted probabilities,\nwhich is recommended for an ensemble of well-calibrated classifiers. Default\nvalue, \u2018auto\u2019, will try to use \u2018soft\u2019 and fall back to \u2018hard\u2019 if the former is\nnot supported.

\n", + default_val: "auto" + }, + weights: { + type: "custom-list", + tooltip: + "

Sequence of weights (float or int) to weight the occurrences of predicted class\nlabels (hard voting) or class probabilities before averaging (soft voting). Uses\nuniform weights when None.

\n", + default_val: "None" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + probability_threshold: { + type: "float", + tooltip: + "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", + default_val: "None" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } + }, + ml_types: "classification regression", + code: "blend_models()", + default: {} + }, + stack_models: { + options: { + meta_model: { + type: "list", + tooltip: "

When None, Logistic Regression is trained as a meta model.

\n", + default_val: "None", + choices: { + lr: "Logistic Regression", + knn: "K Neighbors Classifier", + nb: "Naive Bayes", + dt: "Decision Tree Classifier", + svm: "SVM - Linear Kernel", + rbfsvm: "SVM - Radial Kernel", + gpc: "Gaussian Process Classifier", + mlp: "MLP Classifier", + ridge: "Ridge Classifier", + rf: "Random Forest Classifier", + qda: "Quadratic Discriminant Analysis", + ada: "Ada Boost Classifier", + gbc: "Gradient Boosting Classifier", + lda: "Linear Discriminant Analysis", + et: "Extra Trees Classifier", + xgboost: "Extreme Gradient Boosting", + lightgbm: "Light Gradient Boosting Machine", + catboost: "CatBoost Classifier" + } + }, + meta_model_fold: { + type: "int", + tooltip: + "

Controls internal cross-validation. Can be an integer or a scikit-learn\nCV generator. If set to an integer, will use (Stratified)KFold CV with\nthat many folds. See scikit-learn documentation on Stacking for\nmore details.

\n", + default_val: "5" + }, + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + method: { + type: "string", + tooltip: + "

When set to \u2018auto\u2019, it will invoke, for each estimator, \u2018predict_proba\u2019,\n\u2018decision_function\u2019 or \u2018predict\u2019 in that order. Otherwise, manually pass one\nof the values \u2018predict_proba\u2019, \u2018decision_function\u2019 or \u2018predict\u2019.

\n", + default_val: "auto" + }, + restack: { + type: "bool", + tooltip: "

When set to False, only the predictions of estimators will be used as\ntraining data for the meta_model.

\n", + default_val: "False" + }, + choose_better: { + type: "bool", + tooltip: "

When set to True, the returned object is always better performing. The\nmetric used for comparison is defined by the optimize parameter.

\n", + default_val: "False" + }, + optimize: { + type: "string", + tooltip: "

Metric to compare for model selection when choose_better is True.

\n", + default_val: "Accuracy" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + probability_threshold: { + type: "float", + tooltip: + "

Threshold for converting predicted probability to class label.\nIt defaults to 0.5 for all classifiers unless explicitly defined\nin this parameter. Only applicable for binary classification.

\n", + default_val: "None" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } + }, + ml_types: "classification regression", + code: "stack_models()", + default: {} + }, + calibrate_model: { + options: { + method: { + type: "string", + tooltip: "

The method to use for calibration. Can be \u2018sigmoid\u2019 which corresponds to\nPlatt\u2019s method or \u2018isotonic\u2019 which is a non-parametric approach.

\n", + default_val: "sigmoid" + }, + calibrate_fold: { + type: "int", + tooltip: + "

Controls internal cross-validation. Can be an integer or a scikit-learn\nCV generator. If set to an integer, will use (Stratified)KFold CV with\nthat many folds. See scikit-learn documentation on Stacking for\nmore details.

\n", + default_val: "5" + }, + fold: { + type: "int", + tooltip: + "

Controls cross-validation. If None, the CV generator in the fold_strategy\nparameter of the setup function is used. When an integer is passed,\nit is interpreted as the \u2018n_splits\u2019 parameter of the CV generator in the\nsetup function.

\n", + default_val: "None" + }, + round: { + type: "int", + tooltip: "

Number of decimal places the metrics in the score grid will be rounded to.

\n", + default_val: "4" + }, + fit_kwargs: { + type: "dict", + tooltip: "

Dictionary of arguments passed to the fit method of the model.

\n", + default_val: "{} (empty dict)" + }, + groups: { + type: "string", + tooltip: + "

Optional group labels when GroupKFold is used for the cross validation.\nIt takes an array with shape (n_samples, ) where n_samples is the number\nof rows in training dataset. When string is passed, it is interpreted as\nthe column name in the dataset containing group labels.

\n", + default_val: "None" + }, + verbose: { + type: "bool", + tooltip: "

Score grid is not printed when verbose is set to False.

\n", + default_val: "True" + }, + return_train_score: { + type: "bool", + tooltip: + "

If False, returns the CV Validation scores only.\nIf True, returns the CV training scores along with the CV validation scores.\nThis is useful when the user wants to do bias-variance tradeoff. A high CV\ntraining score with a low corresponding CV validation score indicates overfitting.

\n", + default_val: "False" + } + }, + ml_types: "classification", + code: "calibrate_model()", + default: {} + }, + group_models: { + options: {}, + code: "" + } +} +export default classificationSettings diff --git a/renderer/public/setupVariables/possibleSettings/learning/regressionSettings.js b/renderer/public/setupVariables/possibleSettings/learning/regressionSettings.js index 0d4d1b0d..d8b7ec64 100644 --- a/renderer/public/setupVariables/possibleSettings/learning/regressionSettings.js +++ b/renderer/public/setupVariables/possibleSettings/learning/regressionSettings.js @@ -1008,6 +1008,10 @@ const regressionSettings = { "ml_types": "classification regression", "code": "stack_models()", "default": {} + }, + group_models: { + options: {}, + code: "" } }; export default regressionSettings; \ No newline at end of file diff --git a/renderer/utilities/learning/inputTypesUtils.js b/renderer/utilities/learning/inputTypesUtils.js index 191fc7e4..82f9a5a5 100644 --- a/renderer/utilities/learning/inputTypesUtils.js +++ b/renderer/utilities/learning/inputTypesUtils.js @@ -109,8 +109,12 @@ const implementedTypesDescription = [ type: "int-float-str", description: "int-float-str", default: "" + }, + { + type: "dataframe", + description: "for dataframe input", + default: "" } - ] // this object is used to get the default value and implemeted types of possible settings diff --git a/renderer/utilities/requests.js b/renderer/utilities/requests.js index bb34c781..cc5120d2 100644 --- a/renderer/utilities/requests.js +++ b/renderer/utilities/requests.js @@ -78,7 +78,7 @@ export const axiosPostJsonGo = async (port, topic, json2send, jsonReceivedCB, on try { cleanResponse = JSON.parse(nanToNull(response.data.response_message)) } catch (error) { - cleanResponse = JSON.parse(parsingCleaning(nanToNull(response.data.response_message))) + cleanResponse = JSON.parse(parsingCleaning(nanToNull(response.data.response_message)).replaceAll("\\", "")) } jsonReceivedCB(cleanResponse) } else { @@ 
-145,7 +145,10 @@ export const axiosPostJson = async (jsonData, pathName) => { * @returns the cleaned response */ const parsingCleaning = (response) => { - return response.substring(response.indexOf("{"), response.lastIndexOf("}") + 1) + // console.log("Parsing cleaning (before):", response) + let newResponse = response.substring(response.indexOf("{"), response.lastIndexOf("}") + 1) + // console.log("Parsing cleaning (after):", newResponse) + return newResponse } /**