Experimental support for multichain capping

Simon Lichtinger · Simon Lichtinger · commit cf8a6ef48fd9 · 2024-06-26T14:00:57.000+01:00
diff --git a/PyMEMENTO/gmx_util.py b/PyMEMENTO/gmx_util.py
@@ -379,7 +379,12 @@ def generate_posre(
             )
     else:
         # for multichain proteins, need to split into individual chains (while reassigning atom indices)
-        for chainID in set(multiple_chains):
+        prev_chain_id = None
+        for chainID in multiple_chains:
+            if chainID == prev_chain_id:
+                continue
+
+            prev_chain_id = chainID
             # Make a selection string for the residues of a particular chain ID
             include_chainres = []
             for n, res_num in enumerate(residue_numbers):
@@ -388,23 +393,32 @@ def generate_posre(
             include_res_string = " | ".join([f"r {res}" for res in include_chainres])
 
             # Output a temporary coordinate file that only includes one chain
-
             if len(exclude_res) > 0:
                 string_mda_sele = "protein and not (" + " or ".join(
-                    [f"resnum {res}" for res in exclude_res] + ")"
-                )
+                    [f"resnum {res}" for res in exclude_res]) + ")"
             else:
                 string_mda_sele = "protein"
 
             temp_chain_file = join(folder_path, "temp.gro")
 
             universe_to_process = mda.Universe(file_to_process)
             atomgroups_to_merge = []
+
+            caps_counter = 0
             for n, res in enumerate(
                 universe_to_process.select_atoms(string_mda_sele).residues
             ):
-                if multiple_chains[n] == chainID:
-                    atomgroups_to_merge.append(res.atoms)
+                if not res.resname in ["ACE", "NME"]:
+                    if multiple_chains[n-caps_counter] == chainID:
+                        atomgroups_to_merge.append(res.atoms)
+                elif res.resname == "ACE":
+                    if multiple_chains[n-caps_counter+1] == chainID:
+                        atomgroups_to_merge.append(res.atoms)
+                    caps_counter += 1
+                elif res.resname == "NME":
+                    if multiple_chains[n-caps_counter-1] == chainID:
+                        atomgroups_to_merge.append(res.atoms)
+                    caps_counter += 1
 
             universe_onechain = mda.Merge(*atomgroups_to_merge)
             universe_onechain.atoms.write(temp_chain_file)
diff --git a/PyMEMENTO/modeller_util.py b/PyMEMENTO/modeller_util.py
@@ -81,14 +81,16 @@ def create_ali_file(
 
 
 def run_modeller(
-    path: str, number_of_models: int, ali_file_name: str = "morph->protein.ali"
+    path: str, number_of_models: int,  disulphide_patches:list, ali_file_name: str = "morph->protein.ali",
 ):
     """Run modeller on the ali file contained in a specified directory.
 
     :param path: Directory in which to run modeller.
     :type path: str
     :param number_of_models: How many models to generate.
     :type number_of_models: int
+    :param disulphide_patches: List of tuples of the form (residue_number1, chain_id1, residue_number2, chain_id2) for each disulphide bond to be modelled.
+    :type disulphide_patches: list<tuple<int, str, int, str>>
     :param ali_file_name: Name of the ali file to be used, defaults to "morph->protein.ali"
     :type ali_file_name: str, optional
     """
@@ -117,6 +119,12 @@ def run_modeller(
     # Do the actual modelling based on an ali file
     env = Environ()
     a = AutoModel(env, alnfile=ali_file_name, knowns="morph", sequence="protein")
+
+    # Add disulphide bonds if necessary
+    if disulphide_patches:
+        for bond in disulphide_patches:
+            a.patch(residue_type="DISU", residues=(f"{bond[0]}:{bond[1]}", f"{bond[2]}:{bond[3]}"))
+
     a.starting_model = 1
     a.ending_model = number_of_models
     a.make()
diff --git a/PyMEMENTO/pdb_util.py b/PyMEMENTO/pdb_util.py
@@ -5,7 +5,7 @@
 
 
 def cap_termini(
-    inpath: str, outpath: str, reference: str, first_res: int, last_res: int
+    inpath: str, outpath: str, reference: str, first_res: int, last_res: int, cap_chain:str= None
 ):
     """Patch a protein with termini, as specified in a reference structure.
 
@@ -19,36 +19,75 @@ def cap_termini(
     :type first_res: int
     :param last_res: Number of the last residue in the inpath file (onto which to attach the cap).
     :type last_res: int
+    :param cap_chain: Chain ID to cap for multichain proteins. If None, the entire protein is capped without regard to chains. Defaults to None.
+    :type cap_chain: str, optional
     """
     # Load the pdb files for protein and reference termini
     term_ace = mda.Universe(reference)
     term_nme = mda.Universe(reference)
     prot = mda.Universe(inpath)
 
+    # Chain select string if needed, otherwise empty string
+    chain_select = f" and chainid {cap_chain}" if cap_chain else ""
+
+    print(f"Processing chain {cap_chain}")
+
     # Align termini to protein
     align.alignto(
         term_ace,
         prot,
         select=(
             "resid 2 and (name N or name CA or name C)",
-            f"resid {first_res} and (name N or name CA or name C)",
+            f"resid {first_res} and (name N or name CA or name C)" + chain_select,
         ),
     )
+
+    # id just below the lowest N/CA/C atom id of the first residue in the relevant chain; atoms up to this id go before the ACE cap
+    first_atom_id = min(prot.select_atoms(f"resid {first_res} and (name N or name CA or name C)" + chain_select)[x].id for x in range(3)) - 1
+
     align.alignto(
         term_nme,
         prot,
         select=(
             "resid 3 and (name N or name CA or name C)",
-            f"resid {last_res} and (name N or name CA or name C)",
+            f"resid {last_res} and (name N or name CA or name C)" + chain_select,
         ),
     )
 
+    # id just above the OXT atom of the last residue in the relevant chain; atoms from this id onwards go after the NME cap
+    last_atom_id = prot.select_atoms(f"resid {last_res} and (name OXT)" + chain_select)[0].id + 1
+
     # Construct new molecule
-    patched_prot = mda.Merge(
-        term_ace.select_atoms("resid 1"),
-        prot.select_atoms("not name OXT"),
-        term_nme.select_atoms("resid 4"),
-    )
+
+    # this is ugly but MDAnalysis doesn't let me merge empty selections or None
+    if len(prot.select_atoms(f"id 0-{first_atom_id}")) == 0 and len(prot.select_atoms(f"id {last_atom_id}-9999999")) > 0: 
+        patched_prot = mda.Merge(
+            term_ace.select_atoms("resid 1"),
+            prot.select_atoms("not name OXT "+chain_select),
+            term_nme.select_atoms("resid 4"),
+            prot.select_atoms(f"id {last_atom_id}-9999999")
+        )
+    elif len(prot.select_atoms(f"id {last_atom_id}-9999999")) == 0 and len(prot.select_atoms(f"id 0-{first_atom_id}")) > 0:
+        patched_prot = mda.Merge(
+            prot.select_atoms(f"id 0-{first_atom_id}"),
+            term_ace.select_atoms("resid 1"),
+            prot.select_atoms("not name OXT "+chain_select),
+            term_nme.select_atoms("resid 4")
+        )
+    elif len(prot.select_atoms(f"id {last_atom_id}-9999999")) == 0 and len(prot.select_atoms(f"id 0-{first_atom_id}")) == 0:
+        patched_prot = mda.Merge(
+            term_ace.select_atoms("resid 1"),
+            prot.select_atoms("not name OXT "+chain_select),
+            term_nme.select_atoms("resid 4"),
+        )
+    else:
+        patched_prot = mda.Merge(
+            prot.select_atoms(f"id 0-{first_atom_id}"),
+            term_ace.select_atoms("resid 1"),
+            prot.select_atoms("not name OXT "+chain_select),
+            term_nme.select_atoms("resid 4"),
+            prot.select_atoms(f"id {last_atom_id}-9999999")
+        )
 
     # write output
     patched_prot.atoms.write(outpath)
@@ -74,6 +113,8 @@ def sed(path, replace, by):
 
 def fix_residue_numbers(inpath: str, outpath: str, corrected_residue_numbers: list):
     """Modify the residue numbers in a pdb file to reach a target sequence.
+    This function will also clean up any caps that might have been propagated through
+    MODELLER, because they will not have proper geometry.
 
     :param inpath: Path to the origin pdb file
     :type inpath: str
@@ -86,15 +127,20 @@ def fix_residue_numbers(inpath: str, outpath: str, corrected_residue_numbers: li
         data = f.readlines()
 
     dataout = []
+    dropped_residue_positions = []
 
     previous_resnum = -100
     counter = -1
     for line in data:
+        # drop caps here
+        if "ACE" in line or "NME" in line:
+            continue
         if line[:4] == "ATOM":
             res_num = int(line[22:26])
             if res_num != previous_resnum:
                 previous_resnum = res_num
                 counter += 1
+
             dataout.append(
                 line[:22]
                 + str(corrected_residue_numbers[counter]).rjust(4, " ")
diff --git a/PyMEMENTO/pymemento.py b/PyMEMENTO/pymemento.py
@@ -135,19 +135,35 @@ def __init__(
 
         if multiple_chains != None:
             self.universe_start.add_TopologyAttr("chainID")
-
+            offset = 0
             for r, res in enumerate(
                 self.universe_start.select_atoms("protein").residues
             ):
-                for atom in res.atoms:
-                    atom.chainID = multiple_chains[r]
+                if res.atoms[0].resname == "ACE" or res.atoms[0].resname == "NME":
+                    for atom in res.atoms:
+                        atom.chainID = 'X'
+                    offset += 1
+                else:
+                    for atom in res.atoms:
+                        atom.chainID = multiple_chains[r-offset]
 
             self.universe_target.add_TopologyAttr("chainID")
+            offset = 0
             for r, res in enumerate(
                 self.universe_target.select_atoms("protein").residues
             ):
-                for atom in res.atoms:
-                    atom.chainID = multiple_chains[r]
+                if res.atoms[0].resname == "ACE" or res.atoms[0].resname == "NME":
+                    for atom in res.atoms:
+                        atom.chainID = 'X'
+                    offset += 1
+                    continue
+                else:
+                    for atom in res.atoms:
+                        atom.chainID = multiple_chains[r-offset]
+            
+            # now remove the X chains from the universe
+            self.universe_start = mda.Merge(self.universe_start.select_atoms("not chainid X"))
+            self.universe_target = mda.Merge(self.universe_target.select_atoms("not chainid X"))
 
         # Create working directory if necessary
         os.makedirs(working_dir, exist_ok=True)
@@ -220,7 +236,7 @@ def morph(
             else:
                 self.universe_target.atoms.write(join(local_path, "target.pdb"))
 
-            # Interpolate coordinates, (1-l)* original + l*(target-original)
+            # Interpolate coordinates, (1-l)* original + l*(target)
             for n, l in enumerate(np.linspace(0, 1, number_of_intermediates)):
                 intermediate_universe = self.universe_start.copy()
 
@@ -275,6 +291,7 @@ def make_models(
         include_residues=None,
         poolsize=12,
         mutagenesis=None,
+        disulphide_patches=None,
     ):
         """Use the modeller package to generate fixed models based on the morphs already
         present in the folder structure. Caps are removed at this stage. Mutagenesis can be
@@ -288,6 +305,10 @@ def make_models(
         :type poolsize: int, optional
         :param mutagenesis: List of tuples of the form (residue_number, original_residue, new_residue), eg. (10, 'SER', 'ALA') for S10A mutation. \
         This is not supported for multichain proteins yet. Defaults to None
+        :type mutagenesis: list<tuple<int, str, str>>, optional
+        :param disulphide_patches: List of tuples of the form (residue_number1, chain_id1, residue_number2, chain_id2), eg. (10, 'A', 20, 'B') for a disulphide bridge between C10:A and C20:B. \
+        Defaults to None
+        :type disulphide_patches: list<tuple<int, str, int, str>>, optional
         """
 
         if not self.morph_done:
@@ -345,7 +366,7 @@ def make_models(
             # The normal with statement doesn't work with pytest-cov
             pool = multiprocessing.Pool(poolsize)
             pool.starmap(
-                run_modeller, [(frame, number_of_models) for frame in frame_paths]
+                run_modeller, [(frame, number_of_models, disulphide_patches) for frame in frame_paths]
             )
             pool.close()
             pool.join()
@@ -425,13 +446,6 @@ def process_models(
         :type asp_protonation_states: list, optional
         """
 
-        # Currently multichain is not supported with caps
-
-        if caps and self.multiple_chains:
-            raise RuntimeError(
-                "Currently, combining caps and multichain proteins isn't supported."
-            )
-
         if not self.pathfinding_done:
             raise RuntimeError(
                 "Need to search for the best path before processing models."
@@ -492,25 +506,67 @@ def process_models(
                     else join(ref_folder, "termini_ref.pdb")
                 )
                 for n in range(self.number_of_intermediates):
-                    cap_termini(
-                        join(local_path, file_root + f"{n}.pdb"),
-                        join(local_path, file_root + f"capped{n}.pdb"),
-                        ref_file,
-                        self.residue_numbers[0],
-                        self.residue_numbers[-1],
-                    )
+                    if not self.multiple_chains:
+                        cap_termini(
+                            join(local_path, file_root + f"{n}.pdb"),
+                            join(local_path, file_root + f"capped{n}.pdb"),
+                            ref_file,
+                            self.residue_numbers[0],
+                            self.residue_numbers[-1],
+                        )
+
+                        # Rename the cap residues to fit the protein naming convention
+                        sed(
+                            join(local_path, file_root + f"capped{n}.pdb"),
+                            "ACE X   1",
+                            "ACE A" + str(self.residue_numbers[0] - 1).rjust(4, " "),
+                        )
+                        sed(
+                            join(local_path, file_root + f"capped{n}.pdb"),
+                            "NME X   4",
+                            "NME A" + str(self.residue_numbers[-1] + 1).rjust(4, " "),
+                        )
+                    else:
+                        # get unique chain ids from multiple_chains list in order
+                        
+                        unique_chain_ids = []
+                        for chain_id in self.multiple_chains:
+                            if chain_id not in unique_chain_ids:
+                                unique_chain_ids.append(chain_id)
+
+                        last_filename = join(local_path, file_root + f"{n}.pdb")
+                        for c, chain_id in enumerate(unique_chain_ids):
+                            # get the residue numbers that have this chain id
+                            chain_residue_numbers = [
+                                self.residue_numbers[i]
+                                for i in range(len(self.residue_numbers))
+                                if self.multiple_chains[i] == chain_id
+                            ]
+                            print(chain_residue_numbers)
+            
+                            # cap the termini for this chain, save to a temporary file
+                            cap_termini(
+                                last_filename,
+                                join(local_path, file_root + f"capped{n}.pdb"),
+                                ref_file,
+                                chain_residue_numbers[0],
+                                chain_residue_numbers[-1],
+                                cap_chain=chain_id
+                            )
+                            last_filename = join(local_path, file_root + f"capped{n}.pdb")
+
+                            # Rename the cap residues to fit the protein naming convention
+                            sed(
+                                join(local_path, file_root + f"capped{n}.pdb"),
+                                "ACE X   1",
+                                f"ACE {chain_id}" + str(chain_residue_numbers[0] - 1).rjust(4, " "),
+                            )
+                            sed(
+                                join(local_path, file_root + f"capped{n}.pdb"),
+                                "NME X   4",
+                                f"NME {chain_id}" + str(chain_residue_numbers[-1] + 1).rjust(4, " "),
+                            )
 
-                    # Rename the cap residues to fit the protein naming convention
-                    sed(
-                        join(local_path, file_root + f"capped{n}.pdb"),
-                        "ACE X   1",
-                        "ACE A" + str(self.residue_numbers[0] - 1).rjust(4, " "),
-                    )
-                    sed(
-                        join(local_path, file_root + f"capped{n}.pdb"),
-                        "NME X   4",
-                        "NME A" + str(self.residue_numbers[-1] + 1).rjust(4, " "),
-                    )
                 file_root = "framecapped"
 
             # Copy over the forcefield to our root directory for gmx to use