Skip to content

Commit cf8a6ef

Browse files
author
Simon Lichtinger
committed
Experimental support for multichain capping
1 parent 280efff commit cf8a6ef

File tree

4 files changed

+171
-47
lines changed

4 files changed

+171
-47
lines changed

PyMEMENTO/gmx_util.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,12 @@ def generate_posre(
379379
)
380380
else:
381381
# for multichain proteins, need to split into individual chains (while reassigning atom indices)
382-
for chainID in set(multiple_chains):
382+
prev_chain_id = None
383+
for chainID in multiple_chains:
384+
if chainID == prev_chain_id:
385+
continue
386+
387+
prev_chain_id = chainID
383388
# Make a selection string for the residues of a particular chain ID
384389
include_chainres = []
385390
for n, res_num in enumerate(residue_numbers):
@@ -388,23 +393,32 @@ def generate_posre(
388393
include_res_string = " | ".join([f"r {res}" for res in include_chainres])
389394

390395
# Output a temporary coordinate file that only includes one chain
391-
392396
if len(exclude_res) > 0:
393397
string_mda_sele = "protein and not (" + " or ".join(
394-
[f"resnum {res}" for res in exclude_res] + ")"
395-
)
398+
[f"resnum {res}" for res in exclude_res]) + ")"
396399
else:
397400
string_mda_sele = "protein"
398401

399402
temp_chain_file = join(folder_path, "temp.gro")
400403

401404
universe_to_process = mda.Universe(file_to_process)
402405
atomgroups_to_merge = []
406+
407+
caps_counter = 0
403408
for n, res in enumerate(
404409
universe_to_process.select_atoms(string_mda_sele).residues
405410
):
406-
if multiple_chains[n] == chainID:
407-
atomgroups_to_merge.append(res.atoms)
411+
if not res.resname in ["ACE", "NME"]:
412+
if multiple_chains[n-caps_counter] == chainID:
413+
atomgroups_to_merge.append(res.atoms)
414+
elif res.resname == "ACE":
415+
if multiple_chains[n-caps_counter+1] == chainID:
416+
atomgroups_to_merge.append(res.atoms)
417+
caps_counter += 1
418+
elif res.resname == "NME":
419+
if multiple_chains[n-caps_counter-1] == chainID:
420+
atomgroups_to_merge.append(res.atoms)
421+
caps_counter += 1
408422

409423
universe_onechain = mda.Merge(*atomgroups_to_merge)
410424
universe_onechain.atoms.write(temp_chain_file)

PyMEMENTO/modeller_util.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,16 @@ def create_ali_file(
8181

8282

8383
def run_modeller(
84-
path: str, number_of_models: int, ali_file_name: str = "morph->protein.ali"
84+
path: str, number_of_models: int, disulphide_patches:list, ali_file_name: str = "morph->protein.ali",
8585
):
8686
"""Run modeller on the ali file contained in a specified directory.
8787
8888
:param path: Directory in which to run modeller.
8989
:type path: str
9090
:param number_of_models: How many models to generate.
9191
:type number_of_models: int
92+
:param disulphide_patches: List of tuples of the form (residue_number1, chain_id1, residue_number2, chain_id2) for each disulphide bond to be modelled.
93+
:type disulphide_patches: list<tuple<int, str, int, str>>
9294
:param ali_file_name: Name of the ali file to be used, defaults to "morph->protein.ali"
9395
:type ali_file_name: str, optional
9496
"""
@@ -117,6 +119,12 @@ def run_modeller(
117119
# Do the actual modelling based on an ali file
118120
env = Environ()
119121
a = AutoModel(env, alnfile=ali_file_name, knowns="morph", sequence="protein")
122+
123+
# Add disulphide bonds if necessary
124+
if disulphide_patches:
125+
for bond in disulphide_patches:
126+
a.patch(residue_type="DISU", residues=(f"{bond[0]}:{bond[1]}", f"{bond[2]}:{bond[3]}"))
127+
120128
a.starting_model = 1
121129
a.ending_model = number_of_models
122130
a.make()

PyMEMENTO/pdb_util.py

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
def cap_termini(
8-
inpath: str, outpath: str, reference: str, first_res: int, last_res: int
8+
inpath: str, outpath: str, reference: str, first_res: int, last_res: int, cap_chain:str= None
99
):
1010
"""Patch a protein with termini, as specified in a reference structure.
1111
@@ -19,36 +19,75 @@ def cap_termini(
1919
:type first_res: int
2020
:param last_res: Number of the last residue in the inpath file (onto which to attach the cap).
2121
:type last_res: int
22+
:param cap_chain: Chain ID to cap for multichain proteins. If None, the entire protein without regard to chains is called. Defaults to None.
23+
:type cap_chain: str
2224
"""
2325
# Load the pdb files for protein and reference termini
2426
term_ace = mda.Universe(reference)
2527
term_nme = mda.Universe(reference)
2628
prot = mda.Universe(inpath)
2729

30+
# Chain select string if needed, otherwise empty string
31+
chain_select = f" and chainid {cap_chain}" if cap_chain else ""
32+
33+
print(f"Processing chain {cap_chain}")
34+
2835
# Align termini to protein
2936
align.alignto(
3037
term_ace,
3138
prot,
3239
select=(
3340
"resid 2 and (name N or name CA or name C)",
34-
f"resid {first_res} and (name N or name CA or name C)",
41+
f"resid {first_res} and (name N or name CA or name C)" + chain_select,
3542
),
3643
)
44+
45+
# get the first atom id of the first residue in the relevant chain
46+
first_atom_id = min(prot.select_atoms(f"resid {first_res} and (name N or name CA or name C)" + chain_select)[x].id for x in range(3)) - 1
47+
3748
align.alignto(
3849
term_nme,
3950
prot,
4051
select=(
4152
"resid 3 and (name N or name CA or name C)",
42-
f"resid {last_res} and (name N or name CA or name C)",
53+
f"resid {last_res} and (name N or name CA or name C)" + chain_select,
4354
),
4455
)
4556

57+
# get the last atom id of the last residue in the relevant chain
58+
last_atom_id = prot.select_atoms(f"resid {last_res} and (name OXT)" + chain_select)[0].id + 1
59+
4660
# Construct new molecule
47-
patched_prot = mda.Merge(
48-
term_ace.select_atoms("resid 1"),
49-
prot.select_atoms("not name OXT"),
50-
term_nme.select_atoms("resid 4"),
51-
)
61+
62+
# this is ugly but MDAnalysis doesns't let me merge empty selections or None
63+
if len(prot.select_atoms(f"id 0-{first_atom_id}")) == 0 and len(prot.select_atoms(f"id {last_atom_id}-9999999")) > 0:
64+
patched_prot = mda.Merge(
65+
term_ace.select_atoms("resid 1"),
66+
prot.select_atoms("not name OXT "+chain_select),
67+
term_nme.select_atoms("resid 4"),
68+
prot.select_atoms(f"id {last_atom_id}-9999999")
69+
)
70+
elif len(prot.select_atoms(f"id {last_atom_id}-9999999")) == 0 and len(prot.select_atoms(f"id 0-{first_atom_id}")) > 0:
71+
patched_prot = mda.Merge(
72+
prot.select_atoms(f"id 0-{first_atom_id}"),
73+
term_ace.select_atoms("resid 1"),
74+
prot.select_atoms("not name OXT "+chain_select),
75+
term_nme.select_atoms("resid 4")
76+
)
77+
elif len(prot.select_atoms(f"id {last_atom_id}-9999999")) == 0 and len(prot.select_atoms(f"id 0-{first_atom_id}")) == 0:
78+
patched_prot = mda.Merge(
79+
term_ace.select_atoms("resid 1"),
80+
prot.select_atoms("not name OXT "+chain_select),
81+
term_nme.select_atoms("resid 4"),
82+
)
83+
else:
84+
patched_prot = mda.Merge(
85+
prot.select_atoms(f"id 0-{first_atom_id}"),
86+
term_ace.select_atoms("resid 1"),
87+
prot.select_atoms("not name OXT "+chain_select),
88+
term_nme.select_atoms("resid 4"),
89+
prot.select_atoms(f"id {last_atom_id}-9999999")
90+
)
5291

5392
# write output
5493
patched_prot.atoms.write(outpath)
@@ -74,6 +113,8 @@ def sed(path, replace, by):
74113

75114
def fix_residue_numbers(inpath: str, outpath: str, corrected_residue_numbers: list):
76115
"""Modify the residue numbers in a pdb file to reach a target sequence.
116+
This function will also clean up any caps that might have been propagated through
117+
MODELLER, because they will not have proper geometry.
77118
78119
:param inpath: Path to the origin pdb file
79120
:type inpath: str
@@ -86,15 +127,20 @@ def fix_residue_numbers(inpath: str, outpath: str, corrected_residue_numbers: li
86127
data = f.readlines()
87128

88129
dataout = []
130+
dropped_residue_positions = []
89131

90132
previous_resnum = -100
91133
counter = -1
92134
for line in data:
135+
# drop caps here
136+
if "ACE" in line or "NME" in line:
137+
continue
93138
if line[:4] == "ATOM":
94139
res_num = int(line[22:26])
95140
if res_num != previous_resnum:
96141
previous_resnum = res_num
97142
counter += 1
143+
98144
dataout.append(
99145
line[:22]
100146
+ str(corrected_residue_numbers[counter]).rjust(4, " ")

PyMEMENTO/pymemento.py

Lines changed: 88 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -135,19 +135,35 @@ def __init__(
135135

136136
if multiple_chains != None:
137137
self.universe_start.add_TopologyAttr("chainID")
138-
138+
offset = 0
139139
for r, res in enumerate(
140140
self.universe_start.select_atoms("protein").residues
141141
):
142-
for atom in res.atoms:
143-
atom.chainID = multiple_chains[r]
142+
if res.atoms[0].resname == "ACE" or res.atoms[0].resname == "NME":
143+
for atom in res.atoms:
144+
atom.chainID = 'X'
145+
offset += 1
146+
else:
147+
for atom in res.atoms:
148+
atom.chainID = multiple_chains[r-offset]
144149

145150
self.universe_target.add_TopologyAttr("chainID")
151+
offset = 0
146152
for r, res in enumerate(
147153
self.universe_target.select_atoms("protein").residues
148154
):
149-
for atom in res.atoms:
150-
atom.chainID = multiple_chains[r]
155+
if res.atoms[0].resname == "ACE" or res.atoms[0].resname == "NME":
156+
for atom in res.atoms:
157+
atom.chainID = 'X'
158+
offset += 1
159+
continue
160+
else:
161+
for atom in res.atoms:
162+
atom.chainID = multiple_chains[r-offset]
163+
164+
# now remove the X chains from the universe
165+
self.universe_start = mda.Merge(self.universe_start.select_atoms("not chainid X"))
166+
self.universe_target = mda.Merge(self.universe_target.select_atoms("not chainid X"))
151167

152168
# Create working directory if necessary
153169
os.makedirs(working_dir, exist_ok=True)
@@ -220,7 +236,7 @@ def morph(
220236
else:
221237
self.universe_target.atoms.write(join(local_path, "target.pdb"))
222238

223-
# Interpolate coordinates, (1-l)* original + l*(target-original)
239+
# Interpolate coordinates, (1-l)* original + l*(target)
224240
for n, l in enumerate(np.linspace(0, 1, number_of_intermediates)):
225241
intermediate_universe = self.universe_start.copy()
226242

@@ -275,6 +291,7 @@ def make_models(
275291
include_residues=None,
276292
poolsize=12,
277293
mutagenesis=None,
294+
disulphide_patches=None,
278295
):
279296
"""Use the modeller package to generate fixed models based on the morphs already
280297
present in the folder structure. Caps are removed at this stage. Mutagenesis can be
@@ -288,6 +305,10 @@ def make_models(
288305
:type poolsize: int, optional
289306
:param mutagenesis: List of tuples of the form (residue_number, original_residue, new_residue), eg. (10, 'SER', 'ALA') for S10A mutation. \
290307
This is not supported for multichain proteins yet. Defaults to None
308+
:type mutagenesis: list<tuple<int, str, str>>, optional
309+
:param disulphide_patches: List of tuples of the form (residue_number1, chain_id1, residue_number2, chain_id2), eg. (10, A, 20, B) for a disulphide bridge between C10:A and C20:B. \
310+
Defaults to None
311+
:type disulphide_patches: list<tuple<int, str, int, str>>, optional
291312
"""
292313

293314
if not self.morph_done:
@@ -345,7 +366,7 @@ def make_models(
345366
# The normal with statement doesn't work with pytest-cov
346367
pool = multiprocessing.Pool(poolsize)
347368
pool.starmap(
348-
run_modeller, [(frame, number_of_models) for frame in frame_paths]
369+
run_modeller, [(frame, number_of_models, disulphide_patches) for frame in frame_paths]
349370
)
350371
pool.close()
351372
pool.join()
@@ -425,13 +446,6 @@ def process_models(
425446
:type asp_protonation_states: list, optional
426447
"""
427448

428-
# Currently multichain is not supported with caps
429-
430-
if caps and self.multiple_chains:
431-
raise RuntimeError(
432-
"Currently, combining caps and multichain proteins isn't supported."
433-
)
434-
435449
if not self.pathfinding_done:
436450
raise RuntimeError(
437451
"Need to search for the best path before processing models."
@@ -492,25 +506,67 @@ def process_models(
492506
else join(ref_folder, "termini_ref.pdb")
493507
)
494508
for n in range(self.number_of_intermediates):
495-
cap_termini(
496-
join(local_path, file_root + f"{n}.pdb"),
497-
join(local_path, file_root + f"capped{n}.pdb"),
498-
ref_file,
499-
self.residue_numbers[0],
500-
self.residue_numbers[-1],
501-
)
509+
if not self.multiple_chains:
510+
cap_termini(
511+
join(local_path, file_root + f"{n}.pdb"),
512+
join(local_path, file_root + f"capped{n}.pdb"),
513+
ref_file,
514+
self.residue_numbers[0],
515+
self.residue_numbers[-1],
516+
)
517+
518+
# Rename the cap residues to fit the protein naming convention
519+
sed(
520+
join(local_path, file_root + f"capped{n}.pdb"),
521+
"ACE X 1",
522+
"ACE A" + str(self.residue_numbers[0] - 1).rjust(4, " "),
523+
)
524+
sed(
525+
join(local_path, file_root + f"capped{n}.pdb"),
526+
"NME X 4",
527+
"NME A" + str(self.residue_numbers[-1] + 1).rjust(4, " "),
528+
)
529+
else:
530+
# get unique chain ids from multiple_chains list in order
531+
532+
unique_chain_ids = []
533+
for chain_id in self.multiple_chains:
534+
if chain_id not in unique_chain_ids:
535+
unique_chain_ids.append(chain_id)
536+
537+
last_filename = join(local_path, file_root + f"{n}.pdb")
538+
for c, chain_id in enumerate(unique_chain_ids):
539+
# get the residue numbers that have this chain id
540+
chain_residue_numbers = [
541+
self.residue_numbers[i]
542+
for i in range(len(self.residue_numbers))
543+
if self.multiple_chains[i] == chain_id
544+
]
545+
print(chain_residue_numbers)
546+
547+
# cap the termini for this chain, save to a temporary file
548+
cap_termini(
549+
last_filename,
550+
join(local_path, file_root + f"capped{n}.pdb"),
551+
ref_file,
552+
chain_residue_numbers[0],
553+
chain_residue_numbers[-1],
554+
cap_chain=chain_id
555+
)
556+
last_filename = join(local_path, file_root + f"capped{n}.pdb")
557+
558+
# Rename the cap residues to fit the protein naming convention
559+
sed(
560+
join(local_path, file_root + f"capped{n}.pdb"),
561+
"ACE X 1",
562+
f"ACE {chain_id}" + str(chain_residue_numbers[0] - 1).rjust(4, " "),
563+
)
564+
sed(
565+
join(local_path, file_root + f"capped{n}.pdb"),
566+
"NME X 4",
567+
f"NME {chain_id}" + str(chain_residue_numbers[-1] + 1).rjust(4, " "),
568+
)
502569

503-
# Rename the cap residues to fit the protein naming convention
504-
sed(
505-
join(local_path, file_root + f"capped{n}.pdb"),
506-
"ACE X 1",
507-
"ACE A" + str(self.residue_numbers[0] - 1).rjust(4, " "),
508-
)
509-
sed(
510-
join(local_path, file_root + f"capped{n}.pdb"),
511-
"NME X 4",
512-
"NME A" + str(self.residue_numbers[-1] + 1).rjust(4, " "),
513-
)
514570
file_root = "framecapped"
515571

516572
# Copy over the forcefield to our root directory for gmx to use

0 commit comments

Comments
 (0)