From 6d492bbb28cdfa4d77ca15a47fdbb4ba6aa8a616 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 20:19:54 +0200 Subject: [PATCH 1/5] TrainingStorage: Add train_test_split method --- .../atomistics/job/trainingcontainer.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index ea8f29fa..0d09e562 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -658,3 +658,33 @@ def plot(self): if self._plots is None: self._plots = TrainingPlots(self) return self._plots + + def train_test_split(self, train_size, seed=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + + Returns: + (TrainingStorage, TrainingStorage): two training storages for training and testing + """ + if not (0 < train_size < 1): + raise ValueError(f"train_size must be within (0,1), not {train_size}!") + rng = np.random.default_rng(seed) + brk = int(len(self) * train_size) + if brk in (0, 1): + raise ValueError( + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + ) + + # somewhat inefficient, but probably good enough for normal training set sizes + idx = np.arange(len(self)) + rng.shuffle(idx) + test_idx = idx[:brk] + train_idx = idx[brk:] + return ( + self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx) + ) From 03a49127f4228f3286760e123e884ef8fd16c6f7 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 21:22:33 +0200 Subject: [PATCH 2/5] TrainingContainer: Add include_storage method Adds a TrainingStorage to a TrainingContainer, similar to include_dataset. Move Container beneath Storage definitions for type hinting --- .../atomistics/job/trainingcontainer.py | 797 +++++++++--------- 1 file changed, 403 insertions(+), 394 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 0d09e562..5913c323 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -45,55 +45,167 @@ from pyiron_snippets.deprecate import deprecate -class TrainingContainer(GenericJob, HasStructure): - """ - Stores ASE structures with energies and forces. 
- """ +_HDF_KEYS = { + "energy": "output/generic/energy_pot", + "forces": "output/generic/forces", + "stress": "output/generic/pressures", + "indices": "output/generic/indices", + "species": "input/structure/species", + "cell": "output/generic/cells", + "positions": "output/generic/positions", + "pbc": "input/structure/cell/pbc", +} - def __init__(self, project, job_name): - super().__init__(project=project, job_name=job_name) - self.__name__ = "TrainingContainer" - self.__hdf_version__ = "0.3.0" - self._container = TrainingStorage() +_JOB_HDF_OVERLAY_KEYS = { + "Vasp": { + # For DFT one should use the smeared energy to obtain values + # consistent with the forces, but the default energy_pot of DFT + # jobs is the energy extrapolated to zero smearing + "energy": "output/generic/dft/energy_free", + # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in + # output/generic/stresses + "stress": "output/generic/stresses", + } +} - self.input = DataContainer( - {"save_neighbors": True, "num_neighbors": 12}, table_name="parameters" - ) - def include_job(self, job, iteration_step=-1): +class TrainingStorage(StructureStorage): + def __init__(self): + super().__init__() + self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan) + self._table_cache = None + self.to_pandas() + + def to_pandas(self): """ - Add structure, energy and forces from job. + Export list of structure to pandas table for external fitting codes. - Args: - job (:class:`.AtomisticGenericJob`): job to take structure from - iteration_step (int, optional): if job has multiple steps, this - selects which to add + The table contains the following columns: + - 'name': human-readable name of the structure + - 'ase_atoms': the structure as a :class:`.Atoms` object + - 'energy': the energy of the full structure + - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 + - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 + - 'number_of_atoms': the number of atoms in the structure, N + + Returns: + :class:`pandas.DataFrame`: collected structures """ - self._container.include_job(job, iteration_step) + if self._table_cache is None or len(self._table_cache) != len(self): + self._table_cache = pd.DataFrame( + { + "name": [self.get_array("identifier", i) for i in range(len(self))], + "atoms": [self.get_structure(i) for i in range(len(self))], + "energy": [self.get_array("energy", i) for i in range(len(self))], + } + ) + if self.has_array("forces"): + self._table_cache["forces"] = [ + self.get_array("forces", i) for i in range(len(self)) + ] + if self.has_array("stress"): + self._table_cache["stress"] = [ + self.get_array("stress", i) for i in range(len(self)) + ] + self._table_cache["number_of_atoms"] = [ + len(s) for s in self._table_cache.atoms + ] + return self._table_cache - def include_structure(self, structure, energy=None, name=None, **properties): + def include_job( + self, + job, + iteration_step=-1, + hdf_keys=None, + ): """ - Add new structure to structure list and save energy and forces with it. + Add structure, energy, forces and pressures from an inspected or loaded job. - For consistency with the rest of pyiron, energy should be in units of eV - and forces in eV/A, but no conversion is performed. + The job must be an atomistic job. + + Forces and stresses are only added if present in the output. 
+ + The locations in the job HDF5 file from which the training data is read + can be customized by passing a dictionary as `hdf_keys`, where the + values must be paths inside the job HDF5 file. The available keys are + given below together with their default values. + + * `energy`: `output/generic/energy_pot` + * `forces`: `output/generic/forces` + * `stress`: `output/generic/pressures` + * `indices`: `output/generic/indices` + * `cell`: `output/generic/cells` + * `positions`: `output/generic/positions` + * `species`: `input/structure/species` + * `pbc`: `input/structure/cell/pbc` + + Other keys are ignored. All entries except `pbc` and `species` must + lead to arrays that can be indexed by `iteration_step`. + + For `Vasp` jobs the defaults are changed like this + + * `energy`: `output/generic/dft/energy_free` + * `stress`: `output/generic/stresses` + + to ensure that by default energies, forces and stresses are consistent. Args: - structure_or_job (:class:`~.Atoms`): structure to add - energy (float): energy of the whole structure - forces (Nx3 array of float, optional): per atom forces, where N is - the number of atoms in the structure - stress (6 array of float, optional): per structure stresses in voigt - notation - name (str, optional): name describing the structure + job (:class:`.JobPath`, :class:`.AtomisticGenericJob`): job (path) to take structure from + iteration_step (int, optional): if job has multiple steps, this selects which to add + hdf_keys (dict of str): customize where values are read from the + job HDF5 file """ - self._container.include_structure( - structure, name=name, energy=energy, **properties + + hdf_keys = _HDF_KEYS.copy() + hdf_keys.update(_JOB_HDF_OVERLAY_KEYS.get(job["NAME"], {})) + + kwargs = { + "energy": job[hdf_keys["energy"]][iteration_step], + } + ff = job[hdf_keys["forces"]] + if ff is not None: + kwargs["forces"] = ff[iteration_step] + + pp = job[hdf_keys["stress"]] + if pp is not None and len(pp) > 0: + stress = np.asarray(pp[iteration_step]) + if stress.shape == (3, 3): + stress = np.array( + [ + stress[0, 0], + stress[1, 1], + stress[2, 2], + stress[1, 2], + stress[0, 2], + stress[0, 1], + ] + ) + kwargs["stress"] = stress + + ii = job[hdf_keys["indices"]] + if ii is not None: + indices = ii[iteration_step] + else: + # not all jobs save indices again in the output, but all atomistic + # jobs do it in the input + indices = job["input/structure/indices"] + species = np.asarray(job[hdf_keys["species"]]) + cell = job[hdf_keys["cell"]][iteration_step] + positions = job[hdf_keys["positions"]][iteration_step] + pbc = job[hdf_keys["pbc"]] + + self.add_chunk( + len(indices), + identifier=job.name, + symbols=species[indices], + positions=positions, + cell=[cell], + pbc=[pbc], + **kwargs, ) - def add_structure( - self, structure, energy, forces=None, stress=None, identifier=None, **arrays - ): + @deprecate("Use add_structure instead") + def include_structure(self, structure, energy, name=None, **properties): """ Add new structure to structure list and save energy and forces with it. 
@@ -107,14 +219,21 @@ def add_structure( stress (6 array of float, optional): per structure stresses in voigt notation name (str, optional): name describing the structure """ - self._container.add_structure( - structure, - energy, - identifier=identifier, - forces=forces, - stress=stress, - **arrays, - ) + self.add_structure(structure, identifier=name, energy=energy, **properties) + + def add_structure( + self, structure: Atoms, energy, identifier=None, **arrays + ) -> None: + if "forces" in arrays and not self.has_array("forces"): + self.add_array( + "forces", shape=(3,), dtype=np.float64, per="element", fill=np.nan + ) + if "stress" in arrays and not self.has_array("stress"): + # save stress in voigt notation + self.add_array( + "stress", shape=(6,), dtype=np.float64, per="chunk", fill=np.nan + ) + super().add_structure(structure, identifier=identifier, energy=energy, **arrays) def include_dataset(self, dataset): """ @@ -125,44 +244,256 @@ def include_dataset(self, dataset): - atoms(:class:`ase.Atoms`): the atomic structure - energy(float): energy of the whole structure - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure + - charges (Nx3 array of floats): - stress (6 array of float): per structure stress in voigt notation """ - self._container.include_dataset(dataset) - - def _get_structure(self, frame=-1, wrap_atoms=True): - return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms) - - def _number_of_structures(self): - return self._container.number_of_structures + if ( + "name" not in dataset.columns + or "atoms" not in dataset.columns + or "energy" not in dataset.columns + ): + raise ValueError( + "At least columns 'name', 'atoms' and 'energy' must be present in dataset!" + ) + for row in dataset.itertuples(index=False): + kwargs = {} + if hasattr(row, "forces"): + kwargs["forces"] = row.forces + if hasattr(row, "stress"): + kwargs["stress"] = row.stress + self.add_structure( + row.atoms, energy=row.energy, identifier=row.name, **kwargs + ) - def get_neighbors(self, num_neighbors=None): + def to_list(self, filter_function=None): """ - Calculate and add neighbor information in each structure. - - If input.save_neighbors is True the data is automatically added to the internal storage and will be saved - together with the normal structure data. + Returns the data as lists of pyiron structures, energies, forces, and the number of atoms Args: - num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input + filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it Returns: - NeighborsTrajectory: neighbor information - """ - if num_neighbors is None: - num_neighbors = self.input.num_neighbors - n = NeighborsTrajectory( - has_structure=self, - store=self._container if self.input.save_neighbors else None, - num_neighbors=num_neighbors, - ) - n.compute_neighbors() - return n - - def get_elements(self): + tuple: list of structures, energies, forces, and the number of atoms """ - Return a list of chemical elements in the training set. 
- - Returns: + data_table = self.to_pandas() + if filter_function is not None: + data_table = filter_function(data_table) + structure_list = data_table.atoms.to_list() + energy_list = data_table.energy.to_list() + if "forces" not in data_table.columns: + raise ValueError("no forces defined in storage; call to_dict() instead.") + force_list = data_table.forces.to_list() + num_atoms_list = data_table.number_of_atoms.to_list() + + return (structure_list, energy_list, force_list, num_atoms_list) + + def to_dict(self) -> Dict[str, Any]: + """Return a dictionary of all structures and training properties.""" + dict_arrays = {} + + # Get structure information. + dict_arrays["structure"] = list(self.iter_structures()) + + # Some arrays are only for internal usage or structure information that + # was already saved in dict['structure']. + internal_arrays = [ + "start_index", + "length", + "cell", + "pbc", + "positions", + "symbols", + ] + for array in self.list_arrays(): + # Skip internal arrays. + if array in internal_arrays: + continue + + dict_arrays[array] = self.get_array_ragged(array) + return dict_arrays + + def iter(self, *arrays, wrap_atoms=True): + """ + Iterate over all structures in this object and all arrays that are defined + + Args: + wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to + :meth:`.get_structure()` + *arrays (str): name of arrays that should be iterated over + + Yields: + :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays + """ + array_vals = (self.get_array_ragged(a) for a in arrays) + yield from zip(self.iter_structures(), *array_vals) + + @property + def plot(self): + """ + :class:`.TrainingPlots`: plotting interface + """ + if self._plots is None: + self._plots = TrainingPlots(self) + return self._plots + + def train_test_split(self, train_size, seed=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + + Returns: + (TrainingStorage, TrainingStorage): two training storages for training and testing + """ + if not (0 < train_size < 1): + raise ValueError(f"train_size must be within (0,1), not {train_size}!") + rng = np.random.default_rng(seed) + brk = int(len(self) * train_size) + if brk in (0, 1): + raise ValueError( + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + ) + + # somewhat inefficient, but probably good enough for normal training set sizes + idx = np.arange(len(self)) + rng.shuffle(idx) + test_idx = idx[:brk] + train_idx = idx[brk:] + return ( + self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx) + ) + + +class TrainingContainer(GenericJob, HasStructure): + """ + Stores ASE structures with energies and forces. + """ + + def __init__(self, project, job_name): + super().__init__(project=project, job_name=job_name) + self.__name__ = "TrainingContainer" + self.__hdf_version__ = "0.3.0" + self._container = TrainingStorage() + + self.input = DataContainer( + {"save_neighbors": True, "num_neighbors": 12}, table_name="parameters" + ) + + def include_job(self, job, iteration_step=-1): + """ + Add structure, energy and forces from job. 
+ + Args: + job (:class:`.AtomisticGenericJob`): job to take structure from + iteration_step (int, optional): if job has multiple steps, this + selects which to add + """ + self._container.include_job(job, iteration_step) + + def include_structure(self, structure, energy=None, name=None, **properties): + """ + Add new structure to structure list and save energy and forces with it. + + For consistency with the rest of pyiron, energy should be in units of eV + and forces in eV/A, but no conversion is performed. + + Args: + structure_or_job (:class:`~.Atoms`): structure to add + energy (float): energy of the whole structure + forces (Nx3 array of float, optional): per atom forces, where N is + the number of atoms in the structure + stress (6 array of float, optional): per structure stresses in voigt + notation + name (str, optional): name describing the structure + """ + self._container.include_structure( + structure, name=name, energy=energy, **properties + ) + + def add_structure( + self, structure, energy, forces=None, stress=None, identifier=None, **arrays + ): + """ + Add new structure to structure list and save energy and forces with it. + + For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion + is performed. + + Args: + structure_or_job (:class:`~.Atoms`): structure to add + energy (float): energy of the whole structure + forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure + stress (6 array of float, optional): per structure stresses in voigt notation + name (str, optional): name describing the structure + """ + self._container.add_structure( + structure, + energy, + identifier=identifier, + forces=forces, + stress=stress, + **arrays, + ) + + def include_dataset(self, dataset): + """ + Add a pandas DataFrame to the saved structures. + + The dataframe should have the following columns: + - name: human readable name of the structure + - atoms(:class:`ase.Atoms`): the atomic structure + - energy(float): energy of the whole structure + - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure + - stress (6 array of float): per structure stress in voigt notation + """ + self._container.include_dataset(dataset) + + def include_storage(self, storage: TrainingStorage): + """ + Add a :class:`.TrainingStorage` to the saved structures. + + Args: + storage (:class:`.TrainingStorage`): structures to add + """ + self._container.extend(storage) + + def _get_structure(self, frame=-1, wrap_atoms=True): + return self._container.get_structure(frame=frame, wrap_atoms=wrap_atoms) + + def _number_of_structures(self): + return self._container.number_of_structures + + def get_neighbors(self, num_neighbors=None): + """ + Calculate and add neighbor information in each structure. + + If input.save_neighbors is True the data is automatically added to the internal storage and will be saved + together with the normal structure data. + + Args: + num_neighbors (int, optional): Number of neighbors to collect, if not given use value from input + + Returns: + NeighborsTrajectory: neighbor information + """ + if num_neighbors is None: + num_neighbors = self.input.num_neighbors + n = NeighborsTrajectory( + has_structure=self, + store=self._container if self.input.save_neighbors else None, + num_neighbors=num_neighbors, + ) + n.compute_neighbors() + return n + + def get_elements(self): + """ + Return a list of chemical elements in the training set. 
+ + Returns: :class:`list`: list of unique elements in the training set as strings of their standard abbreviations """ return self._container.get_elements() @@ -366,325 +697,3 @@ def forces(self, axis: Optional[int] = None): f = f.ravel() plt.hist(f, bins=20) plt.xlabel(r"Force [eV/$\mathrm{\AA}$]") - - -_HDF_KEYS = { - "energy": "output/generic/energy_pot", - "forces": "output/generic/forces", - "stress": "output/generic/pressures", - "indices": "output/generic/indices", - "species": "input/structure/species", - "cell": "output/generic/cells", - "positions": "output/generic/positions", - "pbc": "input/structure/cell/pbc", -} - -_JOB_HDF_OVERLAY_KEYS = { - "Vasp": { - # For DFT one should use the smeared energy to obtain values - # consistent with the forces, but the default energy_pot of DFT - # jobs is the energy extrapolated to zero smearing - "energy": "output/generic/dft/energy_free", - # HACK: VASP work-around, current contents of pressures are meaningless, correct values are in - # output/generic/stresses - "stress": "output/generic/stresses", - } -} - - -class TrainingStorage(StructureStorage): - def __init__(self): - super().__init__() - self.add_array("energy", dtype=np.float64, per="chunk", fill=np.nan) - self._table_cache = None - self.to_pandas() - - def to_pandas(self): - """ - Export list of structure to pandas table for external fitting codes. - - The table contains the following columns: - - 'name': human-readable name of the structure - - 'ase_atoms': the structure as a :class:`.Atoms` object - - 'energy': the energy of the full structure - - 'forces': the per atom forces as a :class:`numpy.ndarray`, shape Nx3 - - 'stress': the per structure stress as a :class:`numpy.ndarray`, shape 6 - - 'number_of_atoms': the number of atoms in the structure, N - - Returns: - :class:`pandas.DataFrame`: collected structures - """ - if self._table_cache is None or len(self._table_cache) != len(self): - self._table_cache = pd.DataFrame( - { - "name": [self.get_array("identifier", i) for i in range(len(self))], - "atoms": [self.get_structure(i) for i in range(len(self))], - "energy": [self.get_array("energy", i) for i in range(len(self))], - } - ) - if self.has_array("forces"): - self._table_cache["forces"] = [ - self.get_array("forces", i) for i in range(len(self)) - ] - if self.has_array("stress"): - self._table_cache["stress"] = [ - self.get_array("stress", i) for i in range(len(self)) - ] - self._table_cache["number_of_atoms"] = [ - len(s) for s in self._table_cache.atoms - ] - return self._table_cache - - def include_job( - self, - job, - iteration_step=-1, - hdf_keys=None, - ): - """ - Add structure, energy, forces and pressures from an inspected or loaded job. - - The job must be an atomistic job. - - Forces and stresses are only added if present in the output. - - The locations in the job HDF5 file from which the training data is read - can be customized by passing a dictionary as `hdf_keys`, where the - values must be paths inside the job HDF5 file. The available keys are - given below together with their default values. - - * `energy`: `output/generic/energy_pot` - * `forces`: `output/generic/forces` - * `stress`: `output/generic/pressures` - * `indices`: `output/generic/indices` - * `cell`: `output/generic/cells` - * `positions`: `output/generic/positions` - * `species`: `input/structure/species` - * `pbc`: `input/structure/cell/pbc` - - Other keys are ignored. All entries except `pbc` and `species` must - lead to arrays that can be indexed by `iteration_step`. 
- - For `Vasp` jobs the defaults are changed like this - - * `energy`: `output/generic/dft/energy_free` - * `stress`: `output/generic/stresses` - - to ensure that by default energies, forces and stresses are consistent. - - Args: - job (:class:`.JobPath`, :class:`.AtomisticGenericJob`): job (path) to take structure from - iteration_step (int, optional): if job has multiple steps, this selects which to add - hdf_keys (dict of str): customize where values are read from the - job HDF5 file - """ - - hdf_keys = _HDF_KEYS.copy() - hdf_keys.update(_JOB_HDF_OVERLAY_KEYS.get(job["NAME"], {})) - - kwargs = { - "energy": job[hdf_keys["energy"]][iteration_step], - } - ff = job[hdf_keys["forces"]] - if ff is not None: - kwargs["forces"] = ff[iteration_step] - - pp = job[hdf_keys["stress"]] - if pp is not None and len(pp) > 0: - stress = np.asarray(pp[iteration_step]) - if stress.shape == (3, 3): - stress = np.array( - [ - stress[0, 0], - stress[1, 1], - stress[2, 2], - stress[1, 2], - stress[0, 2], - stress[0, 1], - ] - ) - kwargs["stress"] = stress - - ii = job[hdf_keys["indices"]] - if ii is not None: - indices = ii[iteration_step] - else: - # not all jobs save indices again in the output, but all atomistic - # jobs do it in the input - indices = job["input/structure/indices"] - species = np.asarray(job[hdf_keys["species"]]) - cell = job[hdf_keys["cell"]][iteration_step] - positions = job[hdf_keys["positions"]][iteration_step] - pbc = job[hdf_keys["pbc"]] - - self.add_chunk( - len(indices), - identifier=job.name, - symbols=species[indices], - positions=positions, - cell=[cell], - pbc=[pbc], - **kwargs, - ) - - @deprecate("Use add_structure instead") - def include_structure(self, structure, energy, name=None, **properties): - """ - Add new structure to structure list and save energy and forces with it. - - For consistency with the rest of pyiron, energy should be in units of eV and forces in eV/A, but no conversion - is performed. - - Args: - structure_or_job (:class:`~.Atoms`): structure to add - energy (float): energy of the whole structure - forces (Nx3 array of float, optional): per atom forces, where N is the number of atoms in the structure - stress (6 array of float, optional): per structure stresses in voigt notation - name (str, optional): name describing the structure - """ - self.add_structure(structure, identifier=name, energy=energy, **properties) - - def add_structure( - self, structure: Atoms, energy, identifier=None, **arrays - ) -> None: - if "forces" in arrays and not self.has_array("forces"): - self.add_array( - "forces", shape=(3,), dtype=np.float64, per="element", fill=np.nan - ) - if "stress" in arrays and not self.has_array("stress"): - # save stress in voigt notation - self.add_array( - "stress", shape=(6,), dtype=np.float64, per="chunk", fill=np.nan - ) - super().add_structure(structure, identifier=identifier, energy=energy, **arrays) - - def include_dataset(self, dataset): - """ - Add a pandas DataFrame to the saved structures. 
- - The dataframe should have the following columns: - - name: human readable name of the structure - - atoms(:class:`ase.Atoms`): the atomic structure - - energy(float): energy of the whole structure - - forces (Nx3 array of float): per atom forces, where N is the number of atoms in the structure - - charges (Nx3 array of floats): - - stress (6 array of float): per structure stress in voigt notation - """ - if ( - "name" not in dataset.columns - or "atoms" not in dataset.columns - or "energy" not in dataset.columns - ): - raise ValueError( - "At least columns 'name', 'atoms' and 'energy' must be present in dataset!" - ) - for row in dataset.itertuples(index=False): - kwargs = {} - if hasattr(row, "forces"): - kwargs["forces"] = row.forces - if hasattr(row, "stress"): - kwargs["stress"] = row.stress - self.add_structure( - row.atoms, energy=row.energy, identifier=row.name, **kwargs - ) - - def to_list(self, filter_function=None): - """ - Returns the data as lists of pyiron structures, energies, forces, and the number of atoms - - Args: - filter_function (function): Function applied to the dataset (which is a pandas DataFrame) to filter it - - Returns: - tuple: list of structures, energies, forces, and the number of atoms - """ - data_table = self.to_pandas() - if filter_function is not None: - data_table = filter_function(data_table) - structure_list = data_table.atoms.to_list() - energy_list = data_table.energy.to_list() - if "forces" not in data_table.columns: - raise ValueError("no forces defined in storage; call to_dict() instead.") - force_list = data_table.forces.to_list() - num_atoms_list = data_table.number_of_atoms.to_list() - - return (structure_list, energy_list, force_list, num_atoms_list) - - def to_dict(self) -> Dict[str, Any]: - """Return a dictionary of all structures and training properties.""" - dict_arrays = {} - - # Get structure information. - dict_arrays["structure"] = list(self.iter_structures()) - - # Some arrays are only for internal usage or structure information that - # was already saved in dict['structure']. - internal_arrays = [ - "start_index", - "length", - "cell", - "pbc", - "positions", - "symbols", - ] - for array in self.list_arrays(): - # Skip internal arrays. - if array in internal_arrays: - continue - - dict_arrays[array] = self.get_array_ragged(array) - return dict_arrays - - def iter(self, *arrays, wrap_atoms=True): - """ - Iterate over all structures in this object and all arrays that are defined - - Args: - wrap_atoms (bool): True if the atoms are to be wrapped back into the unit cell; passed to - :meth:`.get_structure()` - *arrays (str): name of arrays that should be iterated over - - Yields: - :class:`pyiron_atomistics.atomistitcs.structure.atoms.Atoms`, arrays: every structure attached to the object and queried arrays - """ - array_vals = (self.get_array_ragged(a) for a in arrays) - yield from zip(self.iter_structures(), *array_vals) - - @property - def plot(self): - """ - :class:`.TrainingPlots`: plotting interface - """ - if self._plots is None: - self._plots = TrainingPlots(self) - return self._plots - - def train_test_split(self, train_size, seed=None): - """ - Split into two random sub sets for training and testing. 
- - Args: - train_size (float): fraction of data points for the training set, must be within (0, 1) - seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do - - Returns: - (TrainingStorage, TrainingStorage): two training storages for training and testing - """ - if not (0 < train_size < 1): - raise ValueError(f"train_size must be within (0,1), not {train_size}!") - rng = np.random.default_rng(seed) - brk = int(len(self) * train_size) - if brk in (0, 1): - raise ValueError( - f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" - ) - - # somewhat inefficient, but probably good enough for normal training set sizes - idx = np.arange(len(self)) - rng.shuffle(idx) - test_idx = idx[:brk] - train_idx = idx[brk:] - return ( - self.sample(lambda f, i: i in test_idx), - self.sample(lambda f, i: i in train_idx) - ) From 9c07e026806a64c9d5e3c2c2d6a7eda888cbbbda Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 24 Jun 2024 20:38:55 +0200 Subject: [PATCH 3/5] TrainingContainer: Add train_test_split method Calls the corresponding method on TrainingStorage. --- .../atomistics/job/trainingcontainer.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 5913c323..399fb607 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -624,6 +624,47 @@ def iter(self, *arrays, wrap_atoms=True): """ yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms) + def train_test_split(self, train_size: float, seed=None, + run: bool = True, + train_name: str = None, test_name: str = None, project=None): + """ + Split into two random sub sets for training and testing. + + Args: + train_size (float): fraction of data points for the training set, must be within (0, 1) + seed (optional): how to initialize the RNG, see numpy.random.default_rng() for details, but an int will do + run (bool, optional): whether to immediately run and save the new containers + train_name (str, optional): default is the name of this container with suffix _train + test_name (str, optional): default is the name of this container with suffix _test + project (Project, optional): where to create the new containers; defaults to this project + + Returns: + (TrainingContainer, TrainingContainer): two training storages for training and testing + + Raises: + ValueError: if either `train_name` or `test_name` already exist in `project`. 
+ ValueError: from :meth:`.TrainingStorage.train_test_split` if `train_size` cannot be realized + """ + if project is None: + project = self.project + if train_name is None: + train_name = f"{self.name}_train" + if test_name is None: + test_name = f"{self.name}_test" + if len({train_name, test_name}.intersection(project.list_nodes())) > 0: + raise ValueError("Target containers already exist!") + + train, test = self._container.train_test_split(train_size, seed) + trainc = project.create.job.TrainingContainer(train_name) + trainc.include_storage(train) + testc = project.create.job.TrainingContainer(test_name) + testc.include_storage(test) + + if run: + trainc.run() + testc.run() + return trainc, testc + class TrainingPlots(StructurePlots): """ From aa8edf83769589f7a9781ba292dee2d765d4b63c Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Tue, 25 Jun 2024 09:51:47 +0200 Subject: [PATCH 4/5] Ensure neighbor compatibility --- .../atomistics/job/trainingcontainer.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 399fb607..5f6f3fdb 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -359,11 +359,11 @@ def train_test_split(self, train_size, seed=None): # somewhat inefficient, but probably good enough for normal training set sizes idx = np.arange(len(self)) rng.shuffle(idx) - test_idx = idx[:brk] - train_idx = idx[brk:] + train_idx = idx[:brk] + test_idx = idx[brk:] return ( + self.sample(lambda f, i: i in train_idx), self.sample(lambda f, i: i in test_idx), - self.sample(lambda f, i: i in train_idx) ) @@ -458,6 +458,15 @@ def include_storage(self, storage: TrainingStorage): Args: storage (:class:`.TrainingStorage`): structures to add """ + # Check whether storage defines neighbor information and whether it is + # compatible without our input + info = storage.has_array("indices") + if info is not None and info["shape"][0] != self.input.num_neighbors: + storage = storage.copy() + storage.del_array("indices") + storage.del_array("distances") + storage.del_array("vecs") + storage.del_array("shells") self._container.extend(storage) def _get_structure(self, frame=-1, wrap_atoms=True): @@ -656,8 +665,16 @@ def train_test_split(self, train_size: float, seed=None, train, test = self._container.train_test_split(train_size, seed) trainc = project.create.job.TrainingContainer(train_name) - trainc.include_storage(train) testc = project.create.job.TrainingContainer(test_name) + + # make sure that the split containers do not try to override any + # neighbor information we may have saved before + trainc.input.save_neighbors = self.input.save_neighbors + trainc.input.num_neighbors = self.input.num_neighbors + testc.input.save_neighbors = self.input.save_neighbors + testc.input.num_neighbors = self.input.num_neighbors + + trainc.include_storage(train) testc.include_storage(test) if run: From 227059063ff1b197daaf768522e4b7c6609c7f85 Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Tue, 25 Jun 2024 08:08:28 +0000 Subject: [PATCH 5/5] Format black --- .../atomistics/job/trainingcontainer.py | 18 ++++++++++++------ pyiron_potentialfit/spgfit/learn.py | 2 +- pyiron_potentialfit/spgfit/structures.py | 11 ++++++++--- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pyiron_potentialfit/atomistics/job/trainingcontainer.py 
b/pyiron_potentialfit/atomistics/job/trainingcontainer.py index 5f6f3fdb..21ccea90 100644 --- a/pyiron_potentialfit/atomistics/job/trainingcontainer.py +++ b/pyiron_potentialfit/atomistics/job/trainingcontainer.py @@ -353,7 +353,7 @@ def train_test_split(self, train_size, seed=None): brk = int(len(self) * train_size) if brk in (0, 1): raise ValueError( - f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" + f"container not large enough to realize this split, only multiples of {1/len(self)} possible!" ) # somewhat inefficient, but probably good enough for normal training set sizes @@ -362,8 +362,8 @@ def train_test_split(self, train_size, seed=None): train_idx = idx[:brk] test_idx = idx[brk:] return ( - self.sample(lambda f, i: i in train_idx), - self.sample(lambda f, i: i in test_idx), + self.sample(lambda f, i: i in train_idx), + self.sample(lambda f, i: i in test_idx), ) @@ -633,9 +633,15 @@ def iter(self, *arrays, wrap_atoms=True): """ yield from self._container.iter(*arrays, wrap_atoms=wrap_atoms) - def train_test_split(self, train_size: float, seed=None, - run: bool = True, - train_name: str = None, test_name: str = None, project=None): + def train_test_split( + self, + train_size: float, + seed=None, + run: bool = True, + train_name: str = None, + test_name: str = None, + project=None, + ): """ Split into two random sub sets for training and testing. diff --git a/pyiron_potentialfit/spgfit/learn.py b/pyiron_potentialfit/spgfit/learn.py index eed4cabb..27d2b308 100644 --- a/pyiron_potentialfit/spgfit/learn.py +++ b/pyiron_potentialfit/spgfit/learn.py @@ -254,7 +254,7 @@ def energy_mae(j): N = inpt.get_array("length") train = np.squeeze(inpt.get_array("energy")) / N pred = np.squeeze(j["output/training_efs"].to_object().get_array("energy")) / N - return np.abs(train-pred).mean() + return np.abs(train - pred).mean() def energy_max(j): diff --git a/pyiron_potentialfit/spgfit/structures.py b/pyiron_potentialfit/spgfit/structures.py index ed862ced..33027eda 100644 --- a/pyiron_potentialfit/spgfit/structures.py +++ b/pyiron_potentialfit/spgfit/structures.py @@ -857,9 +857,10 @@ def main(): help="Retry the current step from scratch", ) parser.add_argument( - "--export", type=str, + "--export", + type=str, help="Optionally specify a directory where to dump POSCAR files with the generated structures after everything " - "is finished." + "is finished.", ) args = parser.parse_args() @@ -892,7 +893,11 @@ def main(): dir_path = os.path.join(args.export, cont.name) os.makedirs(dir_path, exist_ok=True) for i, s in enumerate(cont.iter_structures()): - s.write(os.path.join(dir_path, cont._container["identifier", i]) + ".POSCAR", format="vasp") + s.write( + os.path.join(dir_path, cont._container["identifier", i]) + + ".POSCAR", + format="vasp", + ) if __name__ == "__main__":
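
Usage sketch for the storage-level API added in PATCH 1/5 and relocated in PATCH 2/5. The snippet below is not part of the patches above, only an illustration of how the new method could be called; the project name, the rattled Al structures, and the energy/force values are placeholders, and the (train, test) return order corresponds to the state after PATCH 4/5.

import numpy as np
from pyiron_atomistics import Project
from pyiron_potentialfit.atomistics.job.trainingcontainer import TrainingStorage

pr = Project("split_demo")  # placeholder project name

# Fill a storage with a few perturbed copies of an Al cell and dummy labels.
storage = TrainingStorage()
base = pr.create.structure.bulk("Al", cubic=True)
for i in range(10):
    s = base.copy()
    s.rattle(stdev=0.05)
    storage.add_structure(
        s,
        energy=-3.36 * len(s),         # placeholder energy in eV
        forces=np.zeros((len(s), 3)),  # placeholder forces in eV/A
        identifier=f"rattled_{i}",
    )

# 80/20 split, reproducible via the seed; raises ValueError if the storage is
# too small to realize the requested fraction.
train, test = storage.train_test_split(train_size=0.8, seed=42)
print(len(train), len(test))  # 8 2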
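
The job-level wrapper from PATCH 3/5 (with the neighbor handling from PATCH 4/5) creates and runs two new TrainingContainer jobs next to the parent. Again a hedged sketch, assuming a project that already holds a few finished atomistic jobs; all job names below are invented for illustration.

from pyiron_atomistics import Project

pr = Project("split_demo")  # placeholder project name

tc = pr.create.job.TrainingContainer("training_data")
# "finished_names" stands in for whatever finished atomistic jobs exist here.
finished_names = ["vasp_bulk", "vasp_vacancy", "vasp_surface", "vasp_md_0", "vasp_md_1"]
for name in finished_names:
    tc.include_job(pr.inspect(name))
tc.run()

# Creates and runs "training_data_train" and "training_data_test" in the same
# project; save_neighbors/num_neighbors are copied from the parent so that any
# stored neighbor arrays remain consistent.
train_job, test_job = tc.train_test_split(train_size=0.8, seed=0)

# Optional keywords: run=False to postpone running the new jobs, train_name and
# test_name to rename them, and project= to place them in another (sub)project.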