Merge pull request #85 from davidkastner/update-bfactor-scripts

davidkastner · web-flow · commit cbd944d61aa7 · 2025-09-27T18:42:46.000-04:00
Updated bfactor scripts
diff --git a/pyqmmm/cli.py b/pyqmmm/cli.py
@@ -43,14 +43,18 @@ def cli():
 @click.option("--xyz2pdb", "-x2p", is_flag=True, help="Converts an xyz file or traj to a PDB.")
 @click.option("--repo2markdown", "-r2m", is_flag=True, help="Converts python package to markdown file.")
 @click.option("--submit_clustering", "-sc", is_flag=True, help="Submits clustering jobs to queue.")
+@click.option("--bfactor_chg", "-bchg", is_flag=True, help="Transfer charge to bfactor.")
+@click.option("--bfactor_csv", "-bcsv", is_flag=True, help="Transfer value to bfactor from csv.")
 def io(
     ppm2png,
     delete_xyz_atoms,
     delete_pdb_atoms,
     translate_pdb_to_center,
     xyz2pdb,
     repo2markdown,
-    submit_clustering
+    submit_clustering,
+    bfactor_chg,
+    bfactor_csv,
     ):
     """
     Tools for useful manipulations of common file types.
@@ -130,6 +134,18 @@ def io(
         import pyqmmm.io.submit_clustering
         pyqmmm.io.submit_clustering.main()
 
+    elif bfactor_chg:
+        click.echo("Transfers charges from chg file to bfactor")
+        click.echo("Loading...")
+        import pyqmmm.io.bfactor_chg
+        pyqmmm.io.bfactor_chg.main()
+
+    elif bfactor_csv:
+        click.echo("Transfers value from csv file to bfactor")
+        click.echo("Loading...")
+        import pyqmmm.io.bfactor_csv
+        pyqmmm.io.bfactor_csv.main()
+
 
 @cli.command()
 @click.option("--gbsa_submit", "-gs", is_flag=True, help="Prepares and submits a mmGBSA job.")
diff --git a/pyqmmm/io/bfactor_chg.py b/pyqmmm/io/bfactor_chg.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Replace the B-factor column of a PDB with residue-summed charges from a .chg file.
+
+Behavior:
+- Prompts for PDB and CHG filenames.
+- Verifies:
+    1) The total number of non-empty lines in PDB equals the number of data lines in CHG.
+    2) The PDB contains NO 'TER' lines (will abort if found).
+    3) Every non-empty PDB line is ATOM or HETATM (since we expect 1:1 with CHG lines).
+- Sums charges per residue (chainID, resSeq, iCode) and writes that sum to the B-factor
+  of every atom in the residue.
+- Writes output as: <pdb_stem>_bfactor_charge.pdb (e.g., B401_B402_bfactor_charge.pdb)
+
+Notes on .chg:
+- Similar to XYZ but no header. We read the last whitespace-separated token as charge.
+- The number/order of lines must exactly match the PDB atom order.
+"""
+
+import os
+import sys
+from collections import defaultdict
+
+def ask_path(prompt, default):
+    """Prompt on stdin for a path; an empty (or whitespace-only) answer selects ``default``."""
+    answer = input(f"{prompt} [{default}]: ").strip()
+    if answer:
+        return answer
+    return default
+
+def die(msg, code=1):
+    """Write ``ERROR: <msg>`` to stderr and terminate with exit status ``code``."""
+    sys.stderr.write(f"ERROR: {msg}\n")
+    raise SystemExit(code)
+
+def is_atom_record(line: str) -> bool:
+    """Return True when the record name (first six columns) is ``ATOM  `` or ``HETATM``."""
+    return line.startswith(("ATOM  ", "HETATM"))
+
+def residue_key(line: str):
+    """
+    Build the residue identifier ``(chain_id, resseq, icode)`` for a PDB atom line.
+
+    The fields are read from fixed 0-based positions: 21 (chain), 22:26
+    (residue sequence number) and 26 (insertion code). Each is stripped of
+    whitespace and kept as a string.
+    """
+    return (line[21].strip(), line[22:26].strip(), line[26].strip())
+
+def set_bfactor(line: str, b: float) -> str:
+    """
+    Return a copy of a PDB ATOM/HETATM line with the B-factor field replaced.
+
+    The B-factor occupies columns 61-66 (1-based), i.e. ``line[60:66]``.
+    Everything outside that field, including occupancy (columns 55-60) and
+    the line terminator, is preserved. Lines shorter than 66 columns are
+    right-padded with spaces first.
+
+    The value is written with three decimals when that fits in the six
+    available columns; otherwise the precision is reduced (2, 1, then 0
+    decimals) so the field never spills into the columns that follow it.
+
+    Raises:
+        ValueError: if ``b`` cannot be written in six columns even with no
+            decimals.
+    """
+    # Split off the terminator ("\n" or "\r\n") so padding goes before it.
+    core = line.rstrip("\r\n")
+    nl = line[len(core):]
+    core = core.ljust(66)
+    # "{:6.3f}" is only a *minimum* width: 123.456 or -12.345 would take seven
+    # characters and shift every later column, so fall back to fewer decimals.
+    for decimals in (3, 2, 1, 0):
+        bstr = f"{b:6.{decimals}f}"
+        if len(bstr) <= 6:
+            break
+    else:
+        raise ValueError(f"B-factor value {b!r} does not fit in 6 columns")
+    return core[:60] + bstr + core[66:] + nl
+
+def read_charges_from_chg(path: str):
+    """
+    Read one charge per data line from a .chg file.
+
+    Blank lines and lines starting with ``#`` are skipped. On every other
+    line the last whitespace-separated token is parsed as a float.
+
+    Returns:
+        List of float charges in file order.
+
+    Exits through ``die`` if a token cannot be parsed as a float or if the
+    file yields no charges at all.
+    """
+    charges = []
+    with open(path, "r") as f:
+        for raw in f:
+            line = raw.strip()
+            if not line or line.startswith("#"):
+                continue
+            # ``line`` is non-empty here, so split() yields at least one token.
+            token = line.split()[-1]
+            try:
+                charges.append(float(token))
+            except ValueError as e:
+                # Only a malformed number is expected here; anything else
+                # should surface as a real traceback.
+                die(f"Failed to parse charge from line:\n{raw}\n{e}")
+    if not charges:
+        die("No charge lines found in .chg file.")
+    return charges
+
+def count_nonempty_lines(path: str):
+    """Count the lines of the file at ``path`` that contain non-whitespace text."""
+    with open(path, "r") as handle:
+        return sum(1 for text in handle if text.strip())
+
+def main():
+    """
+    Interactively write residue-summed charges into a PDB's B-factor column.
+
+    Steps: prompt for the PDB and .chg paths, check that the PDB holds only
+    ATOM/HETATM records (and no TER), read the charges, require exactly one
+    charge per non-empty PDB line, sum the charges per residue key, and write
+    ``<pdb_stem>_bfactor_charge.pdb`` in which every atom carries its
+    residue's summed charge. Any failed check exits through ``die``.
+    """
+    # ---- Prompt for filenames (with sensible defaults) ----
+    default_pdb = "B401_B402.pdb"
+    default_chg = "B401_B402.chg"
+    pdb_in = ask_path("PDB file", default_pdb)
+    chg_in = ask_path(".chg file", default_chg)
+
+    if not os.path.isfile(pdb_in):
+        die(f"PDB file not found: {pdb_in}")
+    if not os.path.isfile(chg_in):
+        die(f".chg file not found: {chg_in}")
+
+    # ---- Load PDB ----
+    with open(pdb_in, "r") as f:
+        pdb_lines = f.readlines()
+
+    # ---- Disallow TER lines explicitly ----
+    if any(ln.startswith("TER") for ln in pdb_lines):
+        die("Found 'TER' lines in the PDB. "
+            "This script expects a PDB with only ATOM/HETATM records and no TER.")
+
+    # ---- Ensure every non-empty PDB line is ATOM/HETATM ----
+    for ln in pdb_lines:
+        if ln.strip() and not is_atom_record(ln):
+            die("Found a non-ATOM/HETATM line in the PDB. "
+                "For 1:1 mapping with .chg, the PDB must contain only ATOM/HETATM lines.")
+
+    # ---- Read charges and require exactly one per atom line ----
+    # Compare against the *parsed* charges rather than a raw count of
+    # non-empty .chg lines: read_charges_from_chg skips '#' comment lines,
+    # which a raw line count would wrongly include.
+    charges = read_charges_from_chg(chg_in)
+    atom_lines = [ln for ln in pdb_lines if ln.strip()]
+    atom_count = len(atom_lines)
+    if atom_count != len(charges):
+        die(f"Atom/charge count mismatch after filtering: "
+            f"PDB atoms={atom_count}, CHG charges={len(charges)}.")
+
+    # ---- Build residue keys in atom order ----
+    residue_keys = [residue_key(ln) for ln in atom_lines]
+
+    # ---- Sum charges per residue ----
+    res_sums = defaultdict(float)
+    for q, rkey in zip(charges, residue_keys):
+        res_sums[rkey] += q
+
+    # ---- Create output lines with updated B-factors (no REMARK insertion) ----
+    # residue_keys was built from exactly the non-empty lines, so consuming
+    # one key per non-empty line keeps atoms and keys aligned; blank lines
+    # are passed through untouched.
+    keys_in_order = iter(residue_keys)
+    out_lines = [
+        set_bfactor(ln, res_sums[next(keys_in_order)]) if ln.strip() else ln
+        for ln in pdb_lines
+    ]
+
+    # ---- Derive output filename ----
+    root, _ = os.path.splitext(pdb_in)
+    pdb_out = f"{root}_bfactor_charge.pdb"
+
+    with open(pdb_out, "w") as f:
+        f.writelines(out_lines)
+
+    # ---- Summary ----
+    total_q = sum(charges)
+    print(f"Wrote: {pdb_out}")
+    print(f"Atoms processed: {atom_count}")
+    print(f"Residues found:  {len(res_sums)}")
+    print(f"Sum of CHG charges:            {total_q:.6f}")
+    print(f"Sum of residue-summed charges: {sum(res_sums.values()):.6f}")
+
+if __name__ == "__main__":
+    main()
diff --git a/pyqmmm/io/bfactor_csv.py b/pyqmmm/io/bfactor_csv.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Transfer a per-residue numeric column from a CSV into the B-factor column of a PDB.
+
+- Prompts for CSV path, PDB path, and the CSV column to transfer.
+- CSV must identify residues by columns (case-insensitive):
+    * Chain   -> one of: ["chain", "Chain", "CHAIN"] (optional; blank if absent)
+    * ResSeq  -> one of: ["resseq","resseqid","resnum","res_seq","residue","Residue","resid","ResID","ResSeq"] (required)
+    * ICode   -> one of: ["icode","inscode","insertion","iCode","ICode"] (optional; blank if absent)
+- Column chosen must be numeric; duplicates per residue key are not allowed.
+- Every residue present in the PDB must be present exactly once in the CSV mapping.
+- Non-ATOM/HETATM (e.g., HEADER, TER, REMARK) are preserved as-is.
+- Output: <pdb_stem>_bfactor_from_csv_<col>.pdb
+"""
+
+import os
+import re
+import sys
+import pandas as pd
+from typing import Tuple, Optional
+
+# ---------------------------- Helpers ---------------------------- #
+
+def die(msg: str, code: int = 1):
+    """Report ``msg`` on stderr with an ``ERROR: `` prefix, then exit with status ``code``."""
+    sys.stderr.write(f"ERROR: {msg}\n")
+    raise SystemExit(code)
+
+def ask_path(prompt: str, default: Optional[str] = None) -> str:
+    """
+    Prompt on stdin for a path and return the stripped answer.
+
+    When a non-empty ``default`` is given it is shown in brackets and returned
+    if the user just presses Enter; without one the raw (possibly empty)
+    answer is returned.
+    """
+    if not default:
+        return input(f"{prompt}: ").strip()
+    answer = input(f"{prompt} [{default}]: ").strip()
+    return answer if answer else default
+
+def is_atom_record(line: str) -> bool:
+    """Tell whether ``line`` is a coordinate record, judged by its six-character record name."""
+    return line[:6] in ("ATOM  ", "HETATM")
+
+def residue_key_from_pdb_line(line: str) -> Tuple[str, str, str]:
+    """
+    Extract the ``(chain, resseq, icode)`` key from an ATOM/HETATM line.
+
+    Uses the fixed 0-based positions 21 (chain), 22:26 (residue number) and
+    26 (insertion code). All three are whitespace-stripped and kept as
+    strings, so a residue number stays e.g. ``'401'``.
+    """
+    raw_fields = (line[21], line[22:26], line[26])
+    chain, resseq, icode = (text.strip() for text in raw_fields)
+    return (chain, resseq, icode)
+
+def format_bfactor(line: str, b: float) -> str:
+    """
+    Return ``line`` with its B-factor field (cols 61-66, 1-based) set to ``b``.
+
+    Occupancy, all other columns and the line terminator are preserved; lines
+    shorter than 66 columns are right-padded with spaces first. The value is
+    written with three decimals when that fits in six columns, otherwise with
+    fewer (2, 1, then 0) so the field never pushes later columns to the right.
+
+    Raises:
+        ValueError: if ``b`` cannot be written in six columns even with no
+            decimals.
+    """
+    # Split off the terminator ("\n" or "\r\n") so padding goes before it.
+    core = line.rstrip("\r\n")
+    nl = line[len(core):]
+    core = core.ljust(66)
+    # "{:6.3f}" is only a *minimum* width; CSV values such as 123.456 would
+    # occupy seven characters, so drop decimals until the text fits.
+    for decimals in (3, 2, 1, 0):
+        text = f"{b:6.{decimals}f}"
+        if len(text) <= 6:
+            break
+    else:
+        raise ValueError(f"B-factor value {b!r} does not fit in 6 columns")
+    return core[:60] + text + core[66:] + nl
+
+def find_column(df: pd.DataFrame, candidates) -> Optional[str]:
+    """
+    Return the name of the first DataFrame column matching one of ``candidates``.
+
+    An exact match (in candidate order) wins; failing that, candidates are
+    compared case-insensitively against the column names. Returns ``None``
+    when nothing matches.
+    """
+    exact_hits = [name for name in candidates if name in df.columns]
+    if exact_hits:
+        return exact_hits[0]
+
+    # Case-insensitive fallback: lowercase name -> actual column name. When
+    # two columns differ only by case, the later one is kept.
+    by_lower = {}
+    for column in df.columns:
+        by_lower[column.lower()] = column
+    for name in candidates:
+        folded = name.lower()
+        if folded in by_lower:
+            return by_lower[folded]
+    return None
+
+def sanitize_for_filename(s: str) -> str:
+    """Collapse each run of characters outside ``A-Za-z0-9._-`` into one ``_`` and trim ``_`` from both ends."""
+    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
+    collapsed = unsafe_run.sub("_", s)
+    return collapsed.strip("_")
+
+# ---------------------------- Main ---------------------------- #
+
+def _resseq_to_str(value) -> str:
+    """Render a CSV residue-number cell as the string form used in PDB keys."""
+    # pandas reads an integer column that has missing cells as float, which
+    # would turn 401 into "401.0"; write integer-valued floats without the
+    # ".0" so they can match the residue number text taken from the PDB.
+    if isinstance(value, float) and value.is_integer():
+        return str(int(value))
+    return str(value).strip()
+
+def main():
+    """
+    Interactively copy one numeric CSV column into the B-factor field of a PDB.
+
+    Prompts for the CSV path, the PDB path and the column to transfer; builds
+    a ``(chain, resseq, icode) -> value`` map from the CSV (one row per
+    residue, no missing values); requires every residue in the PDB to be
+    covered; and writes ``<pdb_stem>_bfactor_from_csv_<col>.pdb`` with the
+    value placed on every ATOM/HETATM line of the residue. Other PDB lines
+    are copied unchanged. Any failed check exits through ``die``.
+    """
+    # Prompt for inputs
+    csv_path = ask_path("CSV file", "values.csv")
+    pdb_path = ask_path("PDB file", "structure.pdb")
+
+    if not os.path.isfile(csv_path):
+        die(f"CSV not found: {csv_path}")
+    if not os.path.isfile(pdb_path):
+        die(f"PDB not found: {pdb_path}")
+
+    # Read CSV; any reader failure is turned into a clean error message.
+    try:
+        df = pd.read_csv(csv_path)
+    except Exception as e:
+        die(f"Failed to read CSV: {e}")
+
+    # Identify residue key columns
+    chain_col = find_column(df, ["chain", "Chain", "CHAIN"])
+    resseq_col = find_column(df, ["resseq","resseqid","resnum","res_seq","residue","Residue","resid","ResID","ResSeq"])
+    icode_col = find_column(df, ["icode","inscode","insertion","iCode","ICode"])
+
+    if resseq_col is None:
+        die("Could not find a residue sequence column in CSV. "
+            "Expected one of: ResSeq, resseq, resnum, residue, resid, etc.")
+
+    # Normalize key columns. The frame is created with the CSV's index so
+    # that a scalar "" (absent optional column) is broadcast to every row;
+    # assigning a scalar to an index-less frame would create a zero-row
+    # column that turns into NaN once a real column is added.
+    df_keys = pd.DataFrame(index=df.index)
+    df_keys["Chain"] = df[chain_col].fillna("").astype(str).str.strip() if chain_col else ""
+    df_keys["ResSeq"] = df[resseq_col].map(_resseq_to_str)
+    df_keys["ICode"] = df[icode_col].fillna("").astype(str).str.strip() if icode_col else ""
+
+    # Choose value column (prompt)
+    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
+    if not numeric_cols:
+        die("CSV has no numeric columns to transfer.")
+    print("\nNumeric columns available to transfer:")
+    for i, c in enumerate(numeric_cols, 1):
+        print(f"  {i}. {c}")
+    col_name = input("Enter the exact column name to transfer (or number): ").strip()
+    if col_name.isdigit():
+        idx = int(col_name) - 1
+        if idx < 0 or idx >= len(numeric_cols):
+            die("Invalid column selection.")
+        value_col = numeric_cols[idx]
+    else:
+        if col_name not in df.columns:
+            die(f"Column '{col_name}' not found in CSV.")
+        if not pd.api.types.is_numeric_dtype(df[col_name]):
+            die(f"Column '{col_name}' is not numeric.")
+        value_col = col_name
+
+    # Require a value on every row. The values are kept as their own Series
+    # (aligned with df_keys by index) so a value column that happens to be
+    # named like a key column cannot collide with it.
+    values = df[value_col]
+    na_mask = values.isna()
+    if na_mask.any():
+        bad = df_keys[na_mask][["Chain","ResSeq","ICode"]].head(10).to_dict("records")
+        die(f"Selected column has NA values; first 10 offending residue keys: {bad}")
+
+    # Enforce unique rows per residue key
+    dup_mask = df_keys.duplicated(subset=["Chain","ResSeq","ICode"], keep=False)
+    if dup_mask.any():
+        dups = df_keys.loc[dup_mask, ["Chain","ResSeq","ICode"]].value_counts().head(10)
+        die("CSV contains duplicate rows for the same residue key. "
+            f"First 10 duplicates (key -> count):\n{dups}")
+
+    # Convert to dict. The columns are zipped directly: rows produced by
+    # itertuples() are namedtuples and cannot be indexed by column name.
+    value_by_key = {
+        (chain, resseq, icode): float(val)
+        for chain, resseq, icode, val in zip(
+            df_keys["Chain"], df_keys["ResSeq"], df_keys["ICode"], values
+        )
+    }
+
+    # Read PDB & collect residue keys
+    with open(pdb_path, "r") as f:
+        pdb_lines = f.readlines()
+
+    atom_keys = []
+    atom_idx = []
+    for i, ln in enumerate(pdb_lines):
+        if is_atom_record(ln):
+            atom_idx.append(i)
+            atom_keys.append(residue_key_from_pdb_line(ln))
+
+    if not atom_idx:
+        die("No ATOM/HETATM records found in PDB.")
+
+    # Compute set of residues present in the PDB (built once for lookups)
+    pdb_residue_set = set(atom_keys)
+    pdb_residues = sorted(pdb_residue_set)
+
+    # Check coverage: every PDB residue must be in CSV mapping
+    missing = [k for k in pdb_residues if k not in value_by_key]
+    extra = [k for k in value_by_key if k not in pdb_residue_set]
+
+    if extra:
+        print(f"NOTE: {len(extra)} CSV residue keys not present in PDB (they will be ignored). "
+              f"Example: {extra[:5]}")
+    if missing:
+        preview = missing[:10]
+        die(f"CSV does not cover all PDB residues. Missing {len(missing)} residues. "
+            f"First 10 missing keys: {preview}")
+
+    # Write output with updated B-factors
+    out_lines = list(pdb_lines)
+    for i, rk in zip(atom_idx, atom_keys):
+        out_lines[i] = format_bfactor(out_lines[i], value_by_key[rk])
+
+    stem, _ = os.path.splitext(pdb_path)
+    out_col = sanitize_for_filename(value_col)
+    out_path = f"{stem}_bfactor_from_csv_{out_col}.pdb"
+    with open(out_path, "w") as f:
+        f.writelines(out_lines)
+
+    # Summary. "used" excludes the CSV keys that had no residue in the PDB.
+    used = len(value_by_key) - len(extra)
+    print("\nSuccess.")
+    print(f"  Wrote: {out_path}")
+    print(f"  ATOM/HETATM updated: {len(atom_idx)}")
+    print(f"  Residues in PDB:     {len(pdb_residues)}")
+    print(f"  CSV residues used:    {used} (extras ignored: {len(extra)})")
+    print(f"  Column transferred:   {value_col}")
+
+if __name__ == "__main__":
+    main()
diff --git a/pyqmmm/md/bfactor_adder.py b/pyqmmm/md/bfactor_adder.py