Skip to content

Commit cbd944d

Browse files
authored
Merge pull request #85 from davidkastner/update-bfactor-scripts
Updated bfactor scripts
2 parents 3b0d6f5 + 6e5233a commit cbd944d

File tree

4 files changed

+368
-67
lines changed

4 files changed

+368
-67
lines changed

pyqmmm/cli.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,18 @@ def cli():
4343
@click.option("--xyz2pdb", "-x2p", is_flag=True, help="Converts an xyz file or traj to a PDB.")
4444
@click.option("--repo2markdown", "-r2m", is_flag=True, help="Converts python package to markdown file.")
4545
@click.option("--submit_clustering", "-sc", is_flag=True, help="Submits clustering jobs to queue.")
46+
@click.option("--bfactor_chg", "-bchg", is_flag=True, help="Transfer charge to bfactor.")
47+
@click.option("--bfactor_csv", "-bcsv", is_flag=True, help="Transfer value to bfactor from csv.")
4648
def io(
4749
ppm2png,
4850
delete_xyz_atoms,
4951
delete_pdb_atoms,
5052
translate_pdb_to_center,
5153
xyz2pdb,
5254
repo2markdown,
53-
submit_clustering
55+
submit_clustering,
56+
bfactor_chg,
57+
bfactor_csv,
5458
):
5559
"""
5660
Tools for useful manipulations of common file types.
@@ -130,6 +134,18 @@ def io(
130134
import pyqmmm.io.submit_clustering
131135
pyqmmm.io.submit_clustering.main()
132136

137+
elif bfactor_chg:
138+
click.echo("Transfers charges from chg file to bfactor")
139+
click.echo("Loading...")
140+
import pyqmmm.io.bfactor_chg
141+
pyqmmm.io.bfactor_chg.main()
142+
143+
elif bfactor_csv:
144+
click.echo("Transfers value from csv file to bfactor")
145+
click.echo("Loading...")
146+
import pyqmmm.io.bfactor_csv
147+
pyqmmm.io.bfactor_csv.main()
148+
133149

134150
@cli.command()
135151
@click.option("--gbsa_submit", "-gs", is_flag=True, help="Prepares and submits a mmGBSA job.")

pyqmmm/io/bfactor_chg.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Replace the B-factor column of a PDB with residue-summed charges from a .chg file.
4+
5+
Behavior:
6+
- Prompts for PDB and CHG filenames.
7+
- Verifies:
8+
1) The total number of non-empty lines in PDB equals the number of data lines in CHG.
9+
2) The PDB contains NO 'TER' lines (will abort if found).
10+
3) Every non-empty PDB line is ATOM or HETATM (since we expect 1:1 with CHG lines).
11+
- Sums charges per residue (chainID, resSeq, iCode) and writes that sum to the B-factor
12+
of every atom in the residue.
13+
- Writes output as: <pdb_stem>_bfactor_charge.pdb (e.g., B401_B402_bfactor_charge.pdb)
14+
15+
Notes on .chg:
16+
- Similar to XYZ but no header. We read the last whitespace-separated token as charge.
17+
- The number/order of lines must exactly match the PDB atom order.
18+
"""
19+
20+
import os
21+
import sys
22+
from collections import defaultdict
23+
24+
def ask_path(prompt, default):
25+
s = input(f"{prompt} [{default}]: ").strip()
26+
return s or default
27+
28+
def die(msg, code=1):
29+
print(f"ERROR: {msg}", file=sys.stderr)
30+
sys.exit(code)
31+
32+
def is_atom_record(line: str) -> bool:
33+
rec = line[:6]
34+
return rec == "ATOM " or rec == "HETATM"
35+
36+
def residue_key(line: str):
37+
"""Key residues by (chain_id, resseq, icode)."""
38+
chain_id = line[21].strip()
39+
resseq = line[22:26].strip()
40+
icode = line[26].strip()
41+
return (chain_id, resseq, icode)
42+
43+
def set_bfactor(line: str, b: float) -> str:
44+
"""
45+
Return a copy of the PDB ATOM/HETATM line with B-factor (cols 61-66, 1-based)
46+
set to b, preserving occupancy (cols 55-60).
47+
"""
48+
# Ensure at least up to col 66 exists
49+
if line.endswith("\n"):
50+
core, nl = line[:-1], "\n"
51+
else:
52+
core, nl = line, ""
53+
if len(core) < 66:
54+
core = core + " " * (66 - len(core))
55+
bstr = f"{b:6.3f}"
56+
# occupancy is cols 55-60 (0-based 54:60), B-factor 61-66 (0-based 60:66)
57+
out = core[:60] + bstr + core[66:] + nl
58+
return out
59+
60+
def read_charges_from_chg(path: str):
61+
charges = []
62+
with open(path, "r") as f:
63+
for raw in f:
64+
line = raw.strip()
65+
if not line or line.startswith("#"):
66+
continue
67+
parts = line.split()
68+
try:
69+
charges.append(float(parts[-1]))
70+
except Exception as e:
71+
die(f"Failed to parse charge from line:\n{raw}\n{e}")
72+
if not charges:
73+
die("No charge lines found in .chg file.")
74+
return charges
75+
76+
def count_nonempty_lines(path: str):
77+
n = 0
78+
with open(path, "r") as f:
79+
for raw in f:
80+
if raw.strip():
81+
n += 1
82+
return n
83+
84+
def main():
85+
# ---- Prompt for filenames (with sensible defaults) ----
86+
default_pdb = "B401_B402.pdb"
87+
default_chg = "B401_B402.chg"
88+
pdb_in = ask_path("PDB file", default_pdb)
89+
chg_in = ask_path(".chg file", default_chg)
90+
91+
if not os.path.isfile(pdb_in):
92+
die(f"PDB file not found: {pdb_in}")
93+
if not os.path.isfile(chg_in):
94+
die(f".chg file not found: {chg_in}")
95+
96+
# ---- Quick line-count parity check (exact match expected) ----
97+
pdb_nonempty = count_nonempty_lines(pdb_in)
98+
chg_nonempty = count_nonempty_lines(chg_in)
99+
if pdb_nonempty != chg_nonempty:
100+
die(f"Line count mismatch: PDB has {pdb_nonempty} non-empty lines, "
101+
f"but CHG has {chg_nonempty}. These must be identical.")
102+
103+
# ---- Load files ----
104+
with open(pdb_in, "r") as f:
105+
pdb_lines = [ln for ln in f]
106+
107+
# ---- Disallow TER lines explicitly ----
108+
if any(ln.startswith("TER") for ln in pdb_lines):
109+
die("Found 'TER' lines in the PDB. "
110+
"This script expects a PDB with only ATOM/HETATM records and no TER.")
111+
112+
# ---- Ensure every non-empty PDB line is ATOM/HETATM ----
113+
for ln in pdb_lines:
114+
if ln.strip() and not is_atom_record(ln):
115+
die("Found a non-ATOM/HETATM line in the PDB. "
116+
"For 1:1 mapping with .chg, the PDB must contain only ATOM/HETATM lines.")
117+
118+
# ---- Read charges (must match line-for-line with PDB atoms) ----
119+
charges = read_charges_from_chg(chg_in)
120+
atom_count = sum(1 for ln in pdb_lines if ln.strip())
121+
if atom_count != len(charges):
122+
die(f"Atom/charge count mismatch after filtering: "
123+
f"PDB atoms={atom_count}, CHG charges={len(charges)}.")
124+
125+
# ---- Build residue keys in order ----
126+
residue_keys = [residue_key(ln) for ln in pdb_lines if ln.strip()]
127+
128+
# ---- Sum charges per residue ----
129+
res_sums = defaultdict(float)
130+
for q, rkey in zip(charges, residue_keys):
131+
res_sums[rkey] += q
132+
133+
# ---- Create output lines with updated B-factors (no REMARK insertion) ----
134+
out_lines = []
135+
idx = 0
136+
for ln in pdb_lines:
137+
if ln.strip(): # ATOM/HETATM by earlier check
138+
rkey = residue_keys[idx]
139+
bval = res_sums[rkey]
140+
out_lines.append(set_bfactor(ln, bval))
141+
idx += 1
142+
else:
143+
out_lines.append(ln) # preserve blank lines if any (shouldn't be)
144+
assert idx == len(charges)
145+
146+
# ---- Derive output filename ----
147+
root, ext = os.path.splitext(pdb_in)
148+
pdb_out = f"{root}_bfactor_charge.pdb"
149+
150+
with open(pdb_out, "w") as f:
151+
f.writelines(out_lines)
152+
153+
# ---- Summary ----
154+
total_q = sum(charges)
155+
print(f"Wrote: {pdb_out}")
156+
print(f"Atoms processed: {atom_count}")
157+
print(f"Residues found: {len(res_sums)}")
158+
print(f"Sum of CHG charges: {total_q:.6f}")
159+
print(f"Sum of residue-summed charges: {sum(res_sums.values()):.6f}")
160+
161+
if __name__ == "__main__":
162+
main()

pyqmmm/io/bfactor_csv.py

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Transfer a per-residue numeric column from a CSV into the B-factor column of a PDB.
4+
5+
- Prompts for CSV path, PDB path, and the CSV column to transfer.
6+
- CSV must identify residues by columns (case-insensitive):
7+
* Chain -> one of: ["chain", "Chain", "CHAIN"] (optional; blank if absent)
8+
* ResSeq -> one of: ["resseq","resseqid","resnum","res_seq","residue","Residue","resid","ResID","ResSeq"] (required)
9+
* ICode -> one of: ["icode","inscode","insertion","iCode","ICode"] (optional; blank if absent)
10+
- Column chosen must be numeric; duplicates per residue key are not allowed.
11+
- Every residue present in the PDB must be present exactly once in the CSV mapping.
12+
- Non-ATOM/HETATM (e.g., HEADER, TER, REMARK) are preserved as-is.
13+
- Output: <pdb_stem>_bfactor_from_csv_<col>.pdb
14+
"""
15+
16+
import os
17+
import re
18+
import sys
19+
import pandas as pd
20+
from typing import Tuple, Optional
21+
22+
# ---------------------------- Helpers ---------------------------- #
23+
24+
def die(msg: str, code: int = 1):
25+
print(f"ERROR: {msg}", file=sys.stderr)
26+
sys.exit(code)
27+
28+
def ask_path(prompt: str, default: Optional[str] = None) -> str:
29+
if default:
30+
s = input(f"{prompt} [{default}]: ").strip()
31+
return s or default
32+
return input(f"{prompt}: ").strip()
33+
34+
def is_atom_record(line: str) -> bool:
35+
rec = line[:6]
36+
return rec == "ATOM " or rec == "HETATM"
37+
38+
def residue_key_from_pdb_line(line: str) -> Tuple[str, str, str]:
39+
"""Return (chain, resseq, icode) from ATOM/HETATM line."""
40+
chain = line[21].strip()
41+
resseq = line[22:26].strip() # keep as string (handles e.g., '401')
42+
icode = line[26].strip() # insertion code
43+
return (chain, resseq, icode)
44+
45+
def format_bfactor(line: str, b: float) -> str:
46+
"""Overwrite B-factor (cols 61-66, 1-based) with {:6.3f}, preserve occupancy."""
47+
nl = "\n" if line.endswith("\n") else ""
48+
core = line[:-1] if nl else line
49+
if len(core) < 66:
50+
core = core + " " * (66 - len(core))
51+
return core[:60] + f"{b:6.3f}" + core[66:] + nl
52+
53+
def find_column(df: pd.DataFrame, candidates) -> Optional[str]:
54+
for c in candidates:
55+
if c in df.columns:
56+
return c
57+
# case-insensitive
58+
lower_map = {c.lower(): c for c in df.columns}
59+
for c in candidates:
60+
if c.lower() in lower_map:
61+
return lower_map[c.lower()]
62+
return None
63+
64+
def sanitize_for_filename(s: str) -> str:
65+
return re.sub(r"[^A-Za-z0-9._-]+", "_", s).strip("_")
66+
67+
# ---------------------------- Main ---------------------------- #
68+
69+
def main():
70+
# Prompt for inputs
71+
csv_path = ask_path("CSV file", "values.csv")
72+
pdb_path = ask_path("PDB file", "structure.pdb")
73+
74+
if not os.path.isfile(csv_path):
75+
die(f"CSV not found: {csv_path}")
76+
if not os.path.isfile(pdb_path):
77+
die(f"PDB not found: {pdb_path}")
78+
79+
# Read CSV
80+
try:
81+
df = pd.read_csv(csv_path)
82+
except Exception as e:
83+
die(f"Failed to read CSV: {e}")
84+
85+
# Identify residue key columns
86+
chain_col = find_column(df, ["chain", "Chain", "CHAIN"])
87+
resseq_col = find_column(df, ["resseq","resseqid","resnum","res_seq","residue","Residue","resid","ResID","ResSeq"])
88+
icode_col = find_column(df, ["icode","inscode","insertion","iCode","ICode"])
89+
90+
if resseq_col is None:
91+
die("Could not find a residue sequence column in CSV. "
92+
"Expected one of: ResSeq, resseq, resnum, residue, resid, etc.")
93+
94+
# Normalize key columns
95+
df_keys = pd.DataFrame()
96+
df_keys["Chain"] = df[chain_col].fillna("").astype(str) if chain_col else ""
97+
df_keys["ResSeq"] = df[resseq_col].astype(str).str.strip()
98+
df_keys["ICode"] = df[icode_col].fillna("").astype(str).str.strip() if icode_col else ""
99+
100+
# Choose value column (prompt)
101+
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
102+
if not numeric_cols:
103+
die("CSV has no numeric columns to transfer.")
104+
print("\nNumeric columns available to transfer:")
105+
for i, c in enumerate(numeric_cols, 1):
106+
print(f" {i}. {c}")
107+
col_name = input("Enter the exact column name to transfer (or number): ").strip()
108+
if col_name.isdigit():
109+
idx = int(col_name) - 1
110+
if idx < 0 or idx >= len(numeric_cols):
111+
die("Invalid column selection.")
112+
value_col = numeric_cols[idx]
113+
else:
114+
if col_name not in df.columns:
115+
die(f"Column '{col_name}' not found in CSV.")
116+
if not pd.api.types.is_numeric_dtype(df[col_name]):
117+
die(f"Column '{col_name}' is not numeric.")
118+
value_col = col_name
119+
120+
# Build residue->value map; require uniqueness & non-NA
121+
df_map = pd.concat([df_keys, df[[value_col]]], axis=1)
122+
if df_map[value_col].isna().any():
123+
bad = df_map[df_map[value_col].isna()][["Chain","ResSeq","ICode"]].head(10).to_dict("records")
124+
die(f"Selected column has NA values; first 10 offending residue keys: {bad}")
125+
126+
# Enforce unique rows per residue key
127+
dup_mask = df_map.duplicated(subset=["Chain","ResSeq","ICode"], keep=False)
128+
if dup_mask.any():
129+
dups = df_map.loc[dup_mask, ["Chain","ResSeq","ICode"]].value_counts().head(10)
130+
die("CSV contains duplicate rows for the same residue key. "
131+
f"First 10 duplicates (key -> count):\n{dups}")
132+
133+
# Convert to dict
134+
value_by_key = {
135+
(row.Chain, row.ResSeq, row.ICode): float(row[value_col])
136+
for row in df_map.itertuples(index=False)
137+
}
138+
139+
# Read PDB & collect residue keys
140+
with open(pdb_path, "r") as f:
141+
pdb_lines = f.readlines()
142+
143+
atom_keys = []
144+
atom_idx = []
145+
for i, ln in enumerate(pdb_lines):
146+
if is_atom_record(ln):
147+
atom_idx.append(i)
148+
atom_keys.append(residue_key_from_pdb_line(ln))
149+
150+
if not atom_idx:
151+
die("No ATOM/HETATM records found in PDB.")
152+
153+
# Compute set of residues present in the PDB
154+
pdb_residues = sorted(set(atom_keys))
155+
156+
# Check coverage: every PDB residue must be in CSV mapping
157+
missing = [k for k in pdb_residues if k not in value_by_key]
158+
extra = [k for k in value_by_key.keys() if k not in set(pdb_residues)]
159+
160+
if extra:
161+
print(f"NOTE: {len(extra)} CSV residue keys not present in PDB (they will be ignored). "
162+
f"Example: {extra[:5]}")
163+
if missing:
164+
preview = missing[:10]
165+
die(f"CSV does not cover all PDB residues. Missing {len(missing)} residues. "
166+
f"First 10 missing keys: {preview}")
167+
168+
# Write output with updated B-factors
169+
out_lines = list(pdb_lines)
170+
for i, rk in zip(atom_idx, atom_keys):
171+
bval = value_by_key[rk]
172+
out_lines[i] = format_bfactor(out_lines[i], bval)
173+
174+
stem, _ = os.path.splitext(pdb_path)
175+
out_col = sanitize_for_filename(value_col)
176+
out_path = f"{stem}_bfactor_from_csv_{out_col}.pdb"
177+
with open(out_path, "w") as f:
178+
f.writelines(out_lines)
179+
180+
# Summary
181+
print("\nSuccess.")
182+
print(f" Wrote: {out_path}")
183+
print(f" ATOM/HETATM updated: {len(atom_idx)}")
184+
print(f" Residues in PDB: {len(pdb_residues)}")
185+
print(f" CSV residues used: {len(value_by_key)} (extras ignored: {len(extra)})")
186+
print(f" Column transferred: {value_col}")
187+
188+
if __name__ == "__main__":
189+
main()

0 commit comments

Comments
 (0)