From 6d492bbb28cdfa4d77ca15a47fdbb4ba6aa8a616 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 20:19:54 +0200 Subject: [PATCH 1/5] TrainingStorage: Add train_test_split method --- .../atomistics/job/trainingcontainer.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index ea8f29fa..0d09e562 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -658,3 +658,33 @@ def plot(self): if self._plots is None: self._plots = TrainingPlots(self) return self._plots + + def train_test_split(self, train_size, seed=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + + Returns: + (TrainingStorage, TrainingStorage): two training storages for training and testing + """ + if not (0 < train_size < 1): + raise ValueError(f"train_size must be within (0,1), not {train_size}!") + rng = np.random.default_rng(seed) + brk = int(len(self) * train_size) + if brk in (0, 1): + raise ValueError( + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + ) + + # somewhat inefficient, but probably good enough for normal training set sizes + idx = np.arange(len(self)) + rng.shuffle(idx) + test_idx = idx[:brk] + train_idx = idx[brk:] + return ( + self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx) + ) From 03a49127f4228f3286760e123e884ef8fd16c6f7 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 21:22:33 +0200 Subject: [PATCH 2/5] TrainingContainer: Add include_storage method Adds a TrainingStorage to a TrainingContainer, similar to include_dataset. Move Container beneath Storage definitions for type hinting --- .../atomistics/job/trainingcontainer.py | 797 +++++++++--------- 1 file changed, 403 insertions(+), 394 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 0d09e562..5913c323 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -45,55 +45,167 @@ from pyiron_snippets.deprecate import deprecate -class TrainingContainer(GenericJob, HasStructure): - """ - Stores ASE structures with energies and forces. 
- """ +_HDF_KEYS = { + "energy": "output/generic/energy_pot", + "forces": "output/generic/forces", + "stress": "output/generic/pressures", + "indices": "output/generic/indices", + "species": "input/structure/species", + "cell": "output/generic/cells", + "positions": "output/generic/positions", + "pbc": "input/structure/cell/pbc", +} - def __init__(self, project, job_name): - super().__init__(project=project, job_name=job_name) - self.__name__ = "TrainingContainer" - self.__hdf_version__ = "0.3.0" - self._container = TrainingStorage() +_JOB_HDF_OVERLAY_KEYS = { + "Vasp": { + # For DFT one should use the smeared energy to obtain values + # consistent with the forces, but the default energy_pot of DFT + # jobs is the energy extrapolated to zero smearing + "energy": "output/generic/dft/energy_free", + # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in + # output/generic/stresses + "stress": "output/generic/stresses", + } +} - self.input = DataContainer( - {"save_neighbors": True, "num_neighbors": 12}, table_name="parameters" - ) - def include_job(self, job, iteration_step=-1): +class TrainingStorage(StructureStorage): + def __init__(self): + super().__init__() + self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan) + self._table_cache = None + self.to_pandas() + + def to_pandas(self): """ - Add structure, energy and forces from job. + Export list of structure to pandas table for external fitting codes. - Args: - job (:class:`.AtomisticGenericJob`): job to take structure from - iteration_step (int, optional): if job has multiple steps, this - selects which to add + The table contains the following columns: + - 'name': human-readable name of the structure + - 'ase_atoms': the structure as a :class:`.Atoms` object + - 'energy': the energy of the full structure + - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 + - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 + - 'number_of_atoms': the number of atoms in the structure, N + + Returns: + :class:`pandas.DataFrame`: collected structures """ - self._container.include_job(job, iteration_step) + if self._table_cache is None or len(self._table_cache) != len(self): + self._table_cache = pd.DataFrame( + { + "name": [self.get_array("identifier", i) for i in range(len(self))], + "atoms": [self.get_structure(i) for i in range(len(self))], + "energy": [self.get_array("energy", i) for i in range(len(self))], + } + ) + if self.has_array("forces"): + self._table_cache["forces"] = [ + self.get_array("forces", i) for i in range(len(self)) + ] + if self.has_array("stress"): + self._table_cache["stress"] = [ + self.get_array("stress", i) for i in range(len(self)) + ] + self._table_cache["number_of_atoms"] = [ + len(s) for s in self._table_cache.atoms + ] + return self._table_cache - def include_structure(self, structure, energy=None, name=None, **properties): + def include_job( + self, + job, + iteration_step=-1, + hdf_keys=None, + ): """ - Add new structure to structure list and save energy and forces with it. + Add structure, energy, forces and pressures from an inspected or loaded job. - For consistency with the rest of pyiron, energy should be in units of eV - and forces in eV/A, but no conversion is performed. + The job must be an atomistic job. + + Forces and stresses are only added if present in the output. 
+ + The locations in the job HDF5 file from which the training data is read + can be customized by passing a dictionary as `hdf_keys`, where the + values must be paths inside the job HDF5 file. The available keys are + given below together with their default values. + + * `energy`: `output/generic/energy_pot` + * `forces`: `output/generic/forces` + * `stress`: `output/generic/pressures` + * `indices`: `output/generic/indices` + * `cell`: `output/generic/cells` + * `positions`: `output/generic/positions` + * `species`: `input/structure/species` + * `pbc`: `input/structure/cell/pbc` + + Other keys are ignored. All entries except `pbc` and `species` must + lead to arrays that can be indexed by `iteration_step`. + + For `Vasp` jobs the defaults are changed like this + + * `energy`: `output/generic/dft/energy_free` + * `stress`: `output/generic/stresses` + + to ensure that by default energies, forces and stresses are consistent. Args: - structure_or_job (:class:`~.Atoms`): structure to add - energy (float): energy of the whole structure - forces (Nx3 array of float, optional): per atom forces, where N is - the number of atoms in the structure - stress (6 array of float, optional): per structure stresses in voigt - notation - name (str, optional): name describing the structure + job (:class:`.JobPath`, :class:`.AtomisticGenericJob`): job (path) to take structure from + iteration_step (int, optional): if job has multiple steps, this selects which to add + hdf_keys (dict of str): customize where values are read from the + job HDF5 file """ - self._container.include_structure( - structure, name=name, energy=energy, **properties + + hdf_keys = _HDF_KEYS.copy() + hdf_keys.update(_JOB_HDF_OVERLAY_KEYS.get(job["NAME"], {})) + + kwargs = { + "energy": job[hdf_keys["energy"]][iteration_step], + } + ff = job[hdf_keys["forces"]] + if ff is not None: + kwargs["forces"] = ff[iteration_step] + + pp = job[hdf_keys["stress"]] + if pp is not None and len(pp) > 0: + stress = np.asarray(pp[iteration_step]) + if stress.shape == (3, 3): + stress = np.array( + [ + stress[0, 0], + stress[1, 1], + stress[2, 2], + stress[1, 2], + stress[0, 2], + stress[0, 1], + ] + ) + kwargs["stress"] = stress + + ii = job[hdf_keys["indices"]] + if ii is not None: + indices = ii[iteration_step] + else: + # not all jobs save indices again in the output, but all atomistic + # jobs do it in the input + indices = job["input/structure/indices"] + species = np.asarray(job[hdf_keys["species"]]) + cell = job[hdf_keys["cell"]][iteration_step] + positions = job[hdf_keys["positions"]][iteration_step] + pbc = job[hdf_keys["pbc"]] + + self.add_chunk( + len(indices), + identifier=job.name, + symbols=species[indices], + positions=positions, + cell=[cell], + pbc=[pbc], + **kwargs, ) - def add_structure( - self, structure, energy, forces=None, stress=None, identifier=None, **arrays - ): + @deprecate("Use add_structure instead") + def include_structure(self, structure, energy, name=None, **properties): """ Add new structure to structure list and save energy and forces with it. 
@@ -107,14 +219,21 @@ def add_structure( stress (6 array of float, optional): per structure stresses in voigt notation name (str, optional): name describing the structure """ - self._container.add_structure( - structure, - energy, - identifier=identifier, - forces=forces, - stress=stress, - **arrays, - ) + self.add_structure(structure, identifier=name, energy=energy, **properties) + + def add_structure( + self, structure: Atoms, energy, identifier=None, **arrays + ) -> None: + if "forces" in arrays and not self.has_array("forces"): + self.add_array( + "forces", shape=(3,), dtype=np.float64, per="element", fill=np.nan + ) + if "stress" in arrays and not self.has_array("stress"): + # save stress in voigt notation + self.add_array( + "stress", shape=(6,), dtype=np.float64, per="chunk", fill=np.nan + ) + super().add_structure(structure, identifier=identifier, energy=energy, **arrays) def include_dataset(self, dataset): """ @@ -125,44 +244,256 @@ def include_dataset(self, dataset): - atoms(:class:`ase.Atoms`): the atomic structure - energy(float): energy of the whole structure - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure + - charges (Nx3 array of floats): - stress (6 array of float): per structure stress in voigt notation """ - self._container.include_dataset(dataset) - - def _get_structure(self, frame=-1, wrap_atoms=True): - return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms) - - def _number_of_structures(self): - return self._container.number_of_structures + if ( + "name" not in dataset.columns + or "atoms" not in dataset.columns + or "energy" not in dataset.columns + ): + raise ValueError( + "At least columns 'name', 'atoms' and 'energy' must be present in dataset!" + ) + for row in dataset.itertuples(index=False): + kwargs = {} + if hasattr(row, "forces"): + kwargs["forces"] = row.forces + if hasattr(row, "stress"): + kwargs["stress"] = row.stress + self.add_structure( + row.atoms, energy=row.energy, identifier=row.name, **kwargs + ) - def get_neighbors(self, num_neighbors=None): + def to_list(self, filter_function=None): """ - Calculate and add neighbor information in each structure. - - If input.save_neighbors is True the data is automatically added to the internal storage and will be saved - together with the normal structure data. + Returns the data as lists of pyiron structures, energies, forces, and the number of atoms Args: - num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input + filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it Returns: - NeighborsTrajectory: neighbor information - """ - if num_neighbors is None: - num_neighbors = self.input.num_neighbors - n = NeighborsTrajectory( - has_structure=self, - store=self._container if self.input.save_neighbors else None, - num_neighbors=num_neighbors, - ) - n.compute_neighbors() - return n - - def get_elements(self): + tuple: list of structures, energies, forces, and the number of atoms """ - Return a list of chemical elements in the training set. 
- - Returns: + data_table = self.to_pandas() + if filter_function is not None: + data_table = filter_function(data_table) + structure_list = data_table.atoms.to_list() + energy_list = data_table.energy.to_list() + if "forces" not in data_table.columns: + raise ValueError("no forces defined in storage; call to_dict() instead.") + force_list = data_table.forces.to_list() + num_atoms_list = data_table.number_of_atoms.to_list() + + return (structure_list, energy_list, force_list, num_atoms_list) + + def to_dict(self) -> Dict[str, Any]: + """Return a dictionary of all structures and training properties.""" + dict_arrays = {} + + # Get structure information. + dict_arrays["structure"] = list(self.iter_structures()) + + # Some arrays are only for internal usage or structure information that + # was already saved in dict['structure']. + internal_arrays = [ + "start_index", + "length", + "cell", + "pbc", + "positions", + "symbols", + ] + for array in self.list_arrays(): + # Skip internal arrays. + if array in internal_arrays: + continue + + dict_arrays[array] = self.get_array_ragged(array) + return dict_arrays + + def iter(self, *arrays, wrap_atoms=True): + """ + Iterate over all structures in this object and all arrays that are defined + + Args: + wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to + :meth:`.get_structure()` + *arrays (str): name of arrays that should be iterated over + + Yields: + :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays + """ + array_vals = (self.get_array_ragged(a) for a in arrays) + yield from zip(self.iter_structures(), *array_vals) + + @property + def plot(self): + """ + :class:`.TrainingPlots`: plotting interface + """ + if self._plots is None: + self._plots = TrainingPlots(self) + return self._plots + + def train_test_split(self, train_size, seed=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + + Returns: + (TrainingStorage, TrainingStorage): two training storages for training and testing + """ + if not (0 < train_size < 1): + raise ValueError(f"train_size must be within (0,1), not {train_size}!") + rng = np.random.default_rng(seed) + brk = int(len(self) * train_size) + if brk in (0, 1): + raise ValueError( + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + ) + + # somewhat inefficient, but probably good enough for normal training set sizes + idx = np.arange(len(self)) + rng.shuffle(idx) + test_idx = idx[:brk] + train_idx = idx[brk:] + return ( + self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx) + ) + + +class TrainingContainer(GenericJob, HasStructure): + """ + Stores ASE structures with energies and forces. + """ + + def __init__(self, project, job_name): + super().__init__(project=project, job_name=job_name) + self.__name__ = "TrainingContainer" + self.__hdf_version__ = "0.3.0" + self._container = TrainingStorage() + + self.input = DataContainer( + {"save_neighbors": True, "num_neighbors": 12}, table_name="parameters" + ) + + def include_job(self, job, iteration_step=-1): + """ + Add structure, energy and forces from job. 
+ + Args: + job (:class:`.AtomisticGenericJob`): job to take structure from + iteration_step (int, optional): if job has multiple steps, this + selects which to add + """ + self._container.include_job(job, iteration_step) + + def include_structure(self, structure, energy=None, name=None, **properties): + """ + Add new structure to structure list and save energy and forces with it. + + For consistency with the rest of pyiron, energy should be in units of eV + and forces in eV/A, but no conversion is performed. + + Args: + structure_or_job (:class:`~.Atoms`): structure to add + energy (float): energy of the whole structure + forces (Nx3 array of float, optional): per atom forces, where N is + the number of atoms in the structure + stress (6 array of float, optional): per structure stresses in voigt + notation + name (str, optional): name describing the structure + """ + self._container.include_structure( + structure, name=name, energy=energy, **properties + ) + + def add_structure( + self, structure, energy, forces=None, stress=None, identifier=None, **arrays + ): + """ + Add new structure to structure list and save energy and forces with it. + + For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion + is performed. + + Args: + structure_or_job (:class:`~.Atoms`): structure to add + energy (float): energy of the whole structure + forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure + stress (6 array of float, optional): per structure stresses in voigt notation + name (str, optional): name describing the structure + """ + self._container.add_structure( + structure, + energy, + identifier=identifier, + forces=forces, + stress=stress, + **arrays, + ) + + def include_dataset(self, dataset): + """ + Add a pandas DataFrame to the saved structures. + + The dataframe should have the following columns: + - name: human readable name of the structure + - atoms(:class:`ase.Atoms`): the atomic structure + - energy(float): energy of the whole structure + - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure + - stress (6 array of float): per structure stress in voigt notation + """ + self._container.include_dataset(dataset) + + def include_storage(self, storage: TrainingStorage): + """ + Add a :class:`.TrainingStorage` to the saved structures. + + Args: + storage (:class:`.TrainingStorage`): structures to add + """ + self._container.extend(storage) + + def _get_structure(self, frame=-1, wrap_atoms=True): + return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms) + + def _number_of_structures(self): + return self._container.number_of_structures + + def get_neighbors(self, num_neighbors=None): + """ + Calculate and add neighbor information in each structure. + + If input.save_neighbors is True the data is automatically added to the internal storage and will be saved + together with the normal structure data. + + Args: + num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input + + Returns: + NeighborsTrajectory: neighbor information + """ + if num_neighbors is None: + num_neighbors = self.input.num_neighbors + n = NeighborsTrajectory( + has_structure=self, + store=self._container if self.input.save_neighbors else None, + num_neighbors=num_neighbors, + ) + n.compute_neighbors() + return n + + def get_elements(self): + """ + Return a list of chemical elements in the training set. 
+ + Returns: :class:`list`: list of unique elements in the training set as strings of their standard abbreviations """ return self._container.get_elements() @@ -366,325 +697,3 @@ def forces(self, axis: Optional[int] = None): f = f.ravel() plt.hist(f, bins=20) plt.xlabel(r"Force [eV/$\mathrm{\AA}$]") - - -_HDF_KEYS = { - "energy": "output/generic/energy_pot", - "forces": "output/generic/forces", - "stress": "output/generic/pressures", - "indices": "output/generic/indices", - "species": "input/structure/species", - "cell": "output/generic/cells", - "positions": "output/generic/positions", - "pbc": "input/structure/cell/pbc", -} - -_JOB_HDF_OVERLAY_KEYS = { - "Vasp": { - # For DFT one should use the smeared energy to obtain values - # consistent with the forces, but the default energy_pot of DFT - # jobs is the energy extrapolated to zero smearing - "energy": "output/generic/dft/energy_free", - # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in - # output/generic/stresses - "stress": "output/generic/stresses", - } -} - - -class TrainingStorage(StructureStorage): - def __init__(self): - super().__init__() - self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan) - self._table_cache = None - self.to_pandas() - - def to_pandas(self): - """ - Export list of structure to pandas table for external fitting codes. - - The table contains the following columns: - - 'name': human-readable name of the structure - - 'ase_atoms': the structure as a :class:`.Atoms` object - - 'energy': the energy of the full structure - - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 - - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 - - 'number_of_atoms': the number of atoms in the structure, N - - Returns: - :class:`pandas.DataFrame`: collected structures - """ - if self._table_cache is None or len(self._table_cache) != len(self): - self._table_cache = pd.DataFrame( - { - "name": [self.get_array("identifier", i) for i in range(len(self))], - "atoms": [self.get_structure(i) for i in range(len(self))], - "energy": [self.get_array("energy", i) for i in range(len(self))], - } - ) - if self.has_array("forces"): - self._table_cache["forces"] = [ - self.get_array("forces", i) for i in range(len(self)) - ] - if self.has_array("stress"): - self._table_cache["stress"] = [ - self.get_array("stress", i) for i in range(len(self)) - ] - self._table_cache["number_of_atoms"] = [ - len(s) for s in self._table_cache.atoms - ] - return self._table_cache - - def include_job( - self, - job, - iteration_step=-1, - hdf_keys=None, - ): - """ - Add structure, energy, forces and pressures from an inspected or loaded job. - - The job must be an atomistic job. - - Forces and stresses are only added if present in the output. - - The locations in the job HDF5 file from which the training data is read - can be customized by passing a dictionary as `hdf_keys`, where the - values must be paths inside the job HDF5 file. The available keys are - given below together with their default values. - - * `energy`: `output/generic/energy_pot` - * `forces`: `output/generic/forces` - * `stress`: `output/generic/pressures` - * `indices`: `output/generic/indices` - * `cell`: `output/generic/cells` - * `positions`: `output/generic/positions` - * `species`: `input/structure/species` - * `pbc`: `input/structure/cell/pbc` - - Other keys are ignored. All entries except `pbc` and `species` must - lead to arrays that can be indexed by `iteration_step`. 
- - For `Vasp` jobs the defaults are changed like this - - * `energy`: `output/generic/dft/energy_free` - * `stress`: `output/generic/stresses` - - to ensure that by default energies, forces and stresses are consistent. - - Args: - job (:class:`.JobPath`, :class:`.AtomisticGenericJob`): job (path) to take structure from - iteration_step (int, optional): if job has multiple steps, this selects which to add - hdf_keys (dict of str): customize where values are read from the - job HDF5 file - """ - - hdf_keys = _HDF_KEYS.copy() - hdf_keys.update(_JOB_HDF_OVERLAY_KEYS.get(job["NAME"], {})) - - kwargs = { - "energy": job[hdf_keys["energy"]][iteration_step], - } - ff = job[hdf_keys["forces"]] - if ff is not None: - kwargs["forces"] = ff[iteration_step] - - pp = job[hdf_keys["stress"]] - if pp is not None and len(pp) > 0: - stress = np.asarray(pp[iteration_step]) - if stress.shape == (3, 3): - stress = np.array( - [ - stress[0, 0], - stress[1, 1], - stress[2, 2], - stress[1, 2], - stress[0, 2], - stress[0, 1], - ] - ) - kwargs["stress"] = stress - - ii = job[hdf_keys["indices"]] - if ii is not None: - indices = ii[iteration_step] - else: - # not all jobs save indices again in the output, but all atomistic - # jobs do it in the input - indices = job["input/structure/indices"] - species = np.asarray(job[hdf_keys["species"]]) - cell = job[hdf_keys["cell"]][iteration_step] - positions = job[hdf_keys["positions"]][iteration_step] - pbc = job[hdf_keys["pbc"]] - - self.add_chunk( - len(indices), - identifier=job.name, - symbols=species[indices], - positions=positions, - cell=[cell], - pbc=[pbc], - **kwargs, - ) - - @deprecate("Use add_structure instead") - def include_structure(self, structure, energy, name=None, **properties): - """ - Add new structure to structure list and save energy and forces with it. - - For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion - is performed. - - Args: - structure_or_job (:class:`~.Atoms`): structure to add - energy (float): energy of the whole structure - forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure - stress (6 array of float, optional): per structure stresses in voigt notation - name (str, optional): name describing the structure - """ - self.add_structure(structure, identifier=name, energy=energy, **properties) - - def add_structure( - self, structure: Atoms, energy, identifier=None, **arrays - ) -> None: - if "forces" in arrays and not self.has_array("forces"): - self.add_array( - "forces", shape=(3,), dtype=np.float64, per="element", fill=np.nan - ) - if "stress" in arrays and not self.has_array("stress"): - # save stress in voigt notation - self.add_array( - "stress", shape=(6,), dtype=np.float64, per="chunk", fill=np.nan - ) - super().add_structure(structure, identifier=identifier, energy=energy, **arrays) - - def include_dataset(self, dataset): - """ - Add a pandas DataFrame to the saved structures. 
- - The dataframe should have the following columns: - - name: human readable name of the structure - - atoms(:class:`ase.Atoms`): the atomic structure - - energy(float): energy of the whole structure - - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure - - charges (Nx3 array of floats): - - stress (6 array of float): per structure stress in voigt notation - """ - if ( - "name" not in dataset.columns - or "atoms" not in dataset.columns - or "energy" not in dataset.columns - ): - raise ValueError( - "At least columns 'name', 'atoms' and 'energy' must be present in dataset!" - ) - for row in dataset.itertuples(index=False): - kwargs = {} - if hasattr(row, "forces"): - kwargs["forces"] = row.forces - if hasattr(row, "stress"): - kwargs["stress"] = row.stress - self.add_structure( - row.atoms, energy=row.energy, identifier=row.name, **kwargs - ) - - def to_list(self, filter_function=None): - """ - Returns the data as lists of pyiron structures, energies, forces, and the number of atoms - - Args: - filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it - - Returns: - tuple: list of structures, energies, forces, and the number of atoms - """ - data_table = self.to_pandas() - if filter_function is not None: - data_table = filter_function(data_table) - structure_list = data_table.atoms.to_list() - energy_list = data_table.energy.to_list() - if "forces" not in data_table.columns: - raise ValueError("no forces defined in storage; call to_dict() instead.") - force_list = data_table.forces.to_list() - num_atoms_list = data_table.number_of_atoms.to_list() - - return (structure_list, energy_list, force_list, num_atoms_list) - - def to_dict(self) -> Dict[str, Any]: - """Return a dictionary of all structures and training properties.""" - dict_arrays = {} - - # Get structure information. - dict_arrays["structure"] = list(self.iter_structures()) - - # Some arrays are only for internal usage or structure information that - # was already saved in dict['structure']. - internal_arrays = [ - "start_index", - "length", - "cell", - "pbc", - "positions", - "symbols", - ] - for array in self.list_arrays(): - # Skip internal arrays. - if array in internal_arrays: - continue - - dict_arrays[array] = self.get_array_ragged(array) - return dict_arrays - - def iter(self, *arrays, wrap_atoms=True): - """ - Iterate over all structures in this object and all arrays that are defined - - Args: - wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to - :meth:`.get_structure()` - *arrays (str): name of arrays that should be iterated over - - Yields: - :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays - """ - array_vals = (self.get_array_ragged(a) for a in arrays) - yield from zip(self.iter_structures(), *array_vals) - - @property - def plot(self): - """ - :class:`.TrainingPlots`: plotting interface - """ - if self._plots is None: - self._plots = TrainingPlots(self) - return self._plots - - def train_test_split(self, train_size, seed=None): - """ - Split into two random sub sets for training and testing. 
- - Args: - train_size (float): fraction of data points for the training set, must be within (0, 1) - seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do - - Returns: - (TrainingStorage, TrainingStorage): two training storages for training and testing - """ - if not (0 < train_size < 1): - raise ValueError(f"train_size must be within (0,1), not {train_size}!") - rng = np.random.default_rng(seed) - brk = int(len(self) * train_size) - if brk in (0, 1): - raise ValueError( - f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" - ) - - # somewhat inefficient, but probably good enough for normal training set sizes - idx = np.arange(len(self)) - rng.shuffle(idx) - test_idx = idx[:brk] - train_idx = idx[brk:] - return ( - self.sample(lambda f, i: i in test_idx), - self.sample(lambda f, i: i in train_idx) - ) From 9c07e026806a64c9d5e3c2c2d6a7eda888cbbbda Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 20:38:55 +0200 Subject: [PATCH 3/5] TrainingContainer: Add train_test_split method Calls the corresponding method on TrainingStorage. --- .../atomistics/job/trainingcontainer.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 5913c323..399fb607 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -624,6 +624,47 @@ def iter(self, *arrays, wrap_atoms=True): """ yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms) + def train_test_split(self, train_size: float, seed=None, + run: bool = True, + train_name: str = None, test_name: str = None, project=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + run (bool, optional): whether to immediately run and save the new containers + train_name (str, optional): default is the name of this container with suffix _train + test_name (str, optional): default is the name of this container with suffix _test + project (Project, optional): where to create the new containers; defaults to this project + + Returns: + (TrainingContainer, TrainingContainer): two training storages for training and testing + + Raises: + ValueError: if either `train_name` or `test_name` already exist in `project`. 
+ ValueError: from :meth:`.TrainingStorage.train_test_split` if `train_size` cannot be realized + """ + if project is None: + project = self.project + if train_name is None: + train_name = f"{self.name}_train" + if test_name is None: + test_name = f"{self.name}_test" + if len({train_name, test_name}.intersection(project.list_nodes())) > 0: + raise ValueError("Target containers already exist!") + + train, test = self._container.train_test_split(train_size, seed) + trainc = project.create.job.TrainingContainer(train_name) + trainc.include_storage(train) + testc = project.create.job.TrainingContainer(test_name) + testc.include_storage(test) + + if run: + trainc.run() + testc.run() + return trainc, testc + class TrainingPlots(StructurePlots): """ From aa8edf83769589f7a9781ba292dee2d765d4b63c Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Tue, 25 Jun 2024 09:51:47 +0200 Subject: [PATCH 4/5] Ensure neighbor compatibility --- .../atomistics/job/trainingcontainer.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 399fb607..5f6f3fdb 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -359,11 +359,11 @@ def train_test_split(self, train_size, seed=None): # somewhat inefficient, but probably good enough for normal training set sizes idx = np.arange(len(self)) rng.shuffle(idx) - test_idx = idx[:brk] - train_idx = idx[brk:] + train_idx = idx[:brk] + test_idx = idx[brk:] return ( + self.sample(lambda f, i: i in train_idx), self.sample(lambda f, i: i in test_idx), - self.sample(lambda f, i: i in train_idx) ) @@ -458,6 +458,15 @@ def include_storage(self, storage: TrainingStorage): Args: storage (:class:`.TrainingStorage`): structures to add """ + # Check whether storage defines neighbor information and whether it is + # compatible without our input + info = storage.has_array("indices") + if info is not None and info["shape"][0] != self.input.num_neighbors: + storage = storage.copy() + storage.del_array("indices") + storage.del_array("distances") + storage.del_array("vecs") + storage.del_array("shells") self._container.extend(storage) def _get_structure(self, frame=-1, wrap_atoms=True): @@ -656,8 +665,16 @@ def train_test_split(self, train_size: float, seed=None, train, test = self._container.train_test_split(train_size, seed) trainc = project.create.job.TrainingContainer(train_name) - trainc.include_storage(train) testc = project.create.job.TrainingContainer(test_name) + + # make sure that the split containers do not try to override any + # neighbor information we may have saved before + trainc.input.save_neighbors = self.input.save_neighbors + trainc.input.num_neighbors = self.input.num_neighbors + testc.input.save_neighbors = self.input.save_neighbors + testc.input.num_neighbors = self.input.num_neighbors + + trainc.include_storage(train) testc.include_storage(test) if run: From 227059063ff1b197daaf768522e4b7c6609c7f85 Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Tue, 25 Jun 2024 08:08:28 +0000 Subject: [PATCH 5/5] Format black --- .../atomistics/job/trainingcontainer.py | 18 ++++++++++++------ pyiron_potentialfit/spgfit/learn.py | 2 +- pyiron_potentialfit/spgfit/structures.py | 11 ++++++++--- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py 
b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 5f6f3fdb..21ccea90 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -353,7 +353,7 @@ def train_test_split(self, train_size, seed=None): brk = int(len(self) * train_size) if brk in (0, 1): raise ValueError( - f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" ) # somewhat inefficient, but probably good enough for normal training set sizes @@ -362,8 +362,8 @@ def train_test_split(self, train_size, seed=None): train_idx = idx[:brk] test_idx = idx[brk:] return ( - self.sample(lambda f, i: i in train_idx), - self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx), + self.sample(lambda f, i: i in test_idx), ) @@ -633,9 +633,15 @@ def iter(self, *arrays, wrap_atoms=True): """ yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms) - def train_test_split(self, train_size: float, seed=None, - run: bool = True, - train_name: str = None, test_name: str = None, project=None): + def train_test_split( + self, + train_size: float, + seed=None, + run: bool = True, + train_name: str = None, + test_name: str = None, + project=None, + ): """ Split into two random sub sets for training and testing. diff --git a/pyiron_potentialfit/spgfit/learn.py b/pyiron_potentialfit/spgfit/learn.py index eed4cabb..27d2b308 100644 --- a/pyiron_potentialfit/spgfit/learn.py +++ b/pyiron_potentialfit/spgfit/learn.py @@ -254,7 +254,7 @@ def energy_mae(j): N = inpt.get_array("length") train = np.squeeze(inpt.get_array("energy")) / N pred = np.squeeze(j["output/training_efs"].to_object().get_array("energy")) / N - return np.abs(train-pred).mean() + return np.abs(train - pred).mean() def energy_max(j): diff --git a/pyiron_potentialfit/spgfit/structures.py b/pyiron_potentialfit/spgfit/structures.py index ed862ced..33027eda 100644 --- a/pyiron_potentialfit/spgfit/structures.py +++ b/pyiron_potentialfit/spgfit/structures.py @@ -857,9 +857,10 @@ def main(): help="Retry the current step from scratch", ) parser.add_argument( - "--export", type=str, + "--export", + type=str, help="Optionally specify a directory where to dump POSCAR files with the generated structures after everything " - "is finished." + "is finished.", ) args = parser.parse_args() @@ -892,7 +893,11 @@ def main(): dir_path = os.path.join(args.export, cont.name) os.makedirs(dir_path, exist_ok=True) for i, s in enumerate(cont.iter_structures()): - s.write(os.path.join(dir_path, cont._container["identifier", i]) + ".POSCAR", format="vasp") + s.write( + os.path.join(dir_path, cont._container["identifier", i]) + + ".POSCAR", + format="vasp", + ) if __name__ == "__main__":
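
Usage sketch for the storage-level API added in PATCH 1/5 and relocated in PATCH 2/5. The snippet below is not part of the patches above, only an illustration of how the new method could be called; the project name, the rattled Al structures, and the energy/force values are placeholders, and the (train, test) return order corresponds to the state after PATCH 4/5.

import numpy as np
from pyiron_atomistics import Project
from pyiron_potentialfit.atomistics.job.trainingcontainer import TrainingStorage

pr = Project("split_demo")  # placeholder project name

# Fill a storage with a few perturbed copies of an Al cell and dummy labels.
storage = TrainingStorage()
base = pr.create.structure.bulk("Al", cubic=True)
for i in range(10):
    s = base.copy()
    s.rattle(stdev=0.05)
    storage.add_structure(
        s,
        energy=-3.36 * len(s),         # placeholder energy in eV
        forces=np.zeros((len(s), 3)),  # placeholder forces in eV/A
        identifier=f"rattled_{i}",
    )

# 80/20 split, reproducible via the seed; raises ValueError if the storage is
# too small to realize the requested fraction.
train, test = storage.train_test_split(train_size=0.8, seed=42)
print(len(train), len(test))  # 8 2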
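
The job-level wrapper from PATCH 3/5 (with the neighbor handling from PATCH 4/5) creates and runs two new TrainingContainer jobs next to the parent. Again a hedged sketch, assuming a project that already holds a few finished atomistic jobs; all job names below are invented for illustration.

from pyiron_atomistics import Project

pr = Project("split_demo")  # placeholder project name

tc = pr.create.job.TrainingContainer("training_data")
# "finished_names" stands in for whatever finished atomistic jobs exist here.
finished_names = ["vasp_bulk", "vasp_vacancy", "vasp_surface", "vasp_md_0", "vasp_md_1"]
for name in finished_names:
    tc.include_job(pr.inspect(name))
tc.run()

# Creates and runs "training_data_train" and "training_data_test" in the same
# project; save_neighbors/num_neighbors are copied from the parent so that any
# stored neighbor arrays remain consistent.
train_job, test_job = tc.train_test_split(train_size=0.8, seed=0)

# Optional keywords: run=False to postpone running the new jobs, train_name and
# test_name to rename them, and project= to place them in another (sub)project.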