Skip to content

Commit 566f153

Browse files
authored
Merge pull request #44 from daichengxin/dev
Support for DIA-NN 2.0
2 parents 1ee7ee8 + f0156c9 commit 566f153

File tree

3 files changed

+133
-99
lines changed

3 files changed

+133
-99
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "quantms-utils"
33
description = "Python scripts and helpers for the quantMS workflow"
44
readme = "README.md"
55
license = "MIT"
6-
version = "0.0.17"
6+
version = "0.0.19"
77
authors = [
88
"Yasset Perez-Riverol <[email protected]>",
99
"Dai Chengxin <[email protected]>",

quantmsutils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.18"
1+
__version__ = "0.0.19"

quantmsutils/diann/diann2mztab.py

Lines changed: 131 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ def diann2mztab(
9090
"Modified.Sequence",
9191
"Precursor.Charge",
9292
"Precursor.Quantity",
93-
"File.Name",
9493
"Run",
9594
]
9695

@@ -101,7 +100,6 @@ def diann2mztab(
101100
"PeptideSequence",
102101
"PrecursorCharge",
103102
"Intensity",
104-
"Reference",
105103
"Run",
106104
]
107105
out_msstats = out_msstats[out_msstats["Intensity"] != 0]
@@ -113,12 +111,6 @@ def diann2mztab(
113111
out_msstats["FragmentIon"] = "NA"
114112
out_msstats["ProductCharge"] = "0"
115113
out_msstats["IsotopeLabelType"] = "L"
116-
unique_reference_map = {k: os.path.basename(k) for k in out_msstats["Reference"].unique()}
117-
out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map)
118-
del unique_reference_map
119-
120-
logger.debug("\n\nReference Column >>>")
121-
logger.debug(out_msstats["Reference"])
122114

123115
logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>")
124116
logger.debug(out_msstats.head(5))
@@ -325,7 +317,11 @@ def find_first_file_with_suffix(self, suffix: str) -> os.PathLike:
325317

326318
@property
327319
def report(self) -> os.PathLike:
328-
return self.find_first_file_with_suffix("report.tsv")
320+
# DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet
321+
try:
322+
return self.find_first_file_with_suffix("report.tsv")
323+
except FileNotFoundError:
324+
return self.find_first_file_with_suffix("report.parquet")
329325

330326
@property
331327
def pg_matrix(self) -> os.PathLike:
@@ -362,7 +358,7 @@ def diann_version(self) -> str:
362358
return diann_version_id
363359

364360
def validate_diann_version(self) -> None:
365-
supported_diann_versions = ["1.8.1", "1.9.beta.1", "1.9.2"]
361+
supported_diann_versions = ["1.8.1", "2.0", "2.0.1", "2.0.2"]
366362
if self.diann_version not in supported_diann_versions:
367363
raise ValueError(f"Unsupported DIANN version {self.diann_version}")
368364

@@ -418,7 +414,7 @@ def convert_to_mztab(
418414
sep="\t",
419415
header=0,
420416
)
421-
prh = mztab_prh(report, pg, index_ref, database, fasta_df)
417+
prh = mztab_prh(report, pg, index_ref, database, fasta_df, self.diann_version)
422418
del pg
423419
pr = pd.read_csv(
424420
self.pr_matrix,
@@ -443,15 +439,12 @@ def convert_to_mztab(
443439

444440
def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
445441
remain_cols = [
446-
"File.Name",
447442
"Run",
448443
"Protein.Group",
449444
"Protein.Names",
450445
"Protein.Ids",
451-
"First.Protein.Description",
452446
"PG.MaxLFQ",
453447
"RT",
454-
"MS2.Scan",
455448
"Global.Q.Value",
456449
"Lib.Q.Value",
457450
"PEP",
@@ -462,10 +455,13 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
462455
"Stripped.Sequence",
463456
"Precursor.Charge",
464457
"Precursor.Quantity",
465-
"Global.PG.Q.Value",
466-
"MS2.Scan",
458+
"Global.PG.Q.Value"
467459
]
468-
report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
460+
if "2.0" not in self.diann_version:
461+
report = pd.read_csv(self.report, sep="\t", header=0,
462+
usecols=remain_cols + ["MS2.Scan"])
463+
else:
464+
report = pd.read_parquet(self.report, columns=remain_cols + ["Decoy"])
469465

470466
# filter based on qvalue parameter for downstream analysiss
471467
logger.debug(
@@ -675,7 +671,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
675671
return out_mztab_mtd_t, database
676672

677673

678-
def mztab_prh(report, pg, index_ref, database, fasta_df):
674+
def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
679675
"""
680676
Construct PRH sub-table.
681677
@@ -689,6 +685,8 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
689685
:type database: str
690686
:param fasta_df: A dataframe contains protein IDs, sequences and lengths
691687
:type fasta_df: pandas.core.frame.DataFrame
688+
:param diann_version: DIA-NN version
689+
:type diann_version: str
692690
:return: PRH sub-table
693691
:rtype: pandas.core.frame.DataFrame
694692
"""
@@ -699,7 +697,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
699697
f" input index_ref shape: {index_ref.shape},"
700698
f" input fasta_df shape: {fasta_df.shape}"
701699
)
702-
file = list(pg.columns[5:])
700+
# DIA-NN 2.0 PG file doesn't have Protein IDs column
701+
if "2.0" not in diann_version:
702+
file = list(pg.columns[5:])
703+
else:
704+
file = list(pg.columns[4:])
703705
col = {}
704706
for i in file:
705707
col[i] = (
@@ -961,6 +963,7 @@ def mztab_peh(
961963
}
962964
name_mapper = name_mapper_builder(subname_mapper)
963965
tmp.rename(columns=name_mapper, inplace=True)
966+
print(tmp.columns)
964967
out_mztab_peh = out_mztab_peh.merge(
965968
tmp.rename(columns={"precursor.Index": "pr_id"}),
966969
on="pr_id",
@@ -1087,98 +1090,129 @@ def __find_info(directory, n):
10871090
].astype(str)
10881091

10891092
# TODO seconds returned from precursor.getRT()
1090-
target.loc[:, "RT"] = target.apply(lambda x: x["RT"] / 60, axis=1)
1093+
# RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type
1094+
target.loc[:, "RT"] = target.apply(lambda x: (x["RT"] / 60), axis=1)
1095+
group['RT'] = group['RT'].astype('float64')
10911096

10921097
rt_matched = pd.merge_asof(group, target, on="RT", direction="nearest")
1093-
new_target = target
1094-
new_target.columns = [
1095-
"scan_RT",
1096-
"scan_opt_global_spectrum_reference",
1097-
"MS2.Scan",
1098-
"scan_exp_mass_to_charge",
1099-
]
1100-
scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan")
1098+
# DIA-NN can't export MS2.Scan column in some versions (eg. DIA-NN 2.0, DIA-NN 2.0.1).
1099+
# So we only match MS2 by RT.
1100+
if "MS2.Scan" in group.columns:
1101+
logger.info("Mapping DIA-NN MS2 to mzML by MS2.Scan and RT columns")
1102+
new_target = target
1103+
new_target.columns = [
1104+
"scan_RT",
1105+
"scan_opt_global_spectrum_reference",
1106+
"MS2.Scan",
1107+
"scan_exp_mass_to_charge",
1108+
]
1109+
scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan")
11011110

1102-
# Cross validation spectrum ID between scan matched and RT matched
1103-
# Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
1104-
scan_matched["unassigned_matched"] = scan_matched.apply(
1105-
lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1
1106-
)
1107-
if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0:
1108-
v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist()
1109-
logger.info(
1110-
f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}"
1111-
)
1112-
scan_matched.drop(
1113-
[
1114-
"RT",
1115-
"opt_global_spectrum_reference",
1116-
"DIANN-intraID",
1117-
"exp_mass_to_charge",
1118-
"unassigned_matched",
1119-
],
1120-
inplace=True,
1121-
axis=1,
1122-
)
1123-
scan_matched.rename(
1124-
columns={
1125-
"scan_RT": "RT",
1126-
"scan_opt_global_spectrum_reference": "opt_global_spectrum_reference",
1127-
"scan_exp_mass_to_charge": "exp_mass_to_charge",
1128-
},
1129-
inplace=True,
1111+
# Cross validation spectrum ID between scan matched and RT matched
1112+
# Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
1113+
scan_matched["unassigned_matched"] = scan_matched.apply(
1114+
lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1
11301115
)
1116+
if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0:
1117+
v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist()
1118+
logger.info(
1119+
f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}"
1120+
)
1121+
scan_matched.drop(
1122+
[
1123+
"RT",
1124+
"opt_global_spectrum_reference",
1125+
"DIANN-intraID",
1126+
"exp_mass_to_charge",
1127+
"unassigned_matched",
1128+
],
1129+
inplace=True,
1130+
axis=1,
1131+
)
1132+
scan_matched.rename(
1133+
columns={
1134+
"scan_RT": "RT",
1135+
"scan_opt_global_spectrum_reference": "opt_global_spectrum_reference",
1136+
"scan_exp_mass_to_charge": "exp_mass_to_charge",
1137+
},
1138+
inplace=True,
1139+
)
1140+
else:
1141+
scan_matched.drop(
1142+
[
1143+
"scan_RT",
1144+
"scan_opt_global_spectrum_reference",
1145+
"scan_exp_mass_to_charge",
1146+
"unassigned_matched",
1147+
],
1148+
inplace=True,
1149+
axis=1,
1150+
)
1151+
1152+
out_mztab_psh = pd.concat([out_mztab_psh, scan_matched])
11311153
else:
1132-
scan_matched.drop(
1133-
[
1134-
"scan_RT",
1135-
"scan_opt_global_spectrum_reference",
1136-
"scan_exp_mass_to_charge",
1137-
"unassigned_matched",
1138-
],
1139-
inplace=True,
1140-
axis=1,
1141-
)
1142-
1143-
out_mztab_psh = pd.concat([out_mztab_psh, scan_matched])
1154+
logger.info("MS2.Scan column isn't in DIA-NN report, only Matching MS2 by RT values")
1155+
out_mztab_psh = pd.concat([out_mztab_psh, rt_matched])
11441156

11451157
del report
11461158

11471159
# Score at PSM level: Q.Value
1148-
out_mztab_psh = out_mztab_psh[
1149-
[
1150-
"Stripped.Sequence",
1151-
"Protein.Group",
1152-
"Q.Value",
1153-
"RT",
1154-
"Precursor.Charge",
1155-
"Calculate.Precursor.Mz",
1156-
"exp_mass_to_charge",
1157-
"Modified.Sequence",
1158-
"PEP",
1159-
"Global.Q.Value",
1160-
"Global.Q.Value",
1161-
"opt_global_spectrum_reference",
1162-
"ms_run",
1163-
]
1164-
]
1165-
out_mztab_psh.columns = [
1166-
"sequence",
1167-
"accession",
1168-
"search_engine_score[1]",
1169-
"retention_time",
1170-
"charge",
1171-
"calc_mass_to_charge",
1160+
psm_columns = [
1161+
"Stripped.Sequence",
1162+
"Protein.Group",
1163+
"Q.Value",
1164+
"RT",
1165+
"Precursor.Charge",
1166+
"Calculate.Precursor.Mz",
11721167
"exp_mass_to_charge",
1173-
"opt_global_cv_MS:1000889_peptidoform_sequence",
1174-
"opt_global_SpecEValue_score",
1175-
"opt_global_q-value",
1176-
"opt_global_q-value_score",
1168+
"Modified.Sequence",
1169+
"PEP",
1170+
"Global.Q.Value",
1171+
"Global.Q.Value",
11771172
"opt_global_spectrum_reference",
11781173
"ms_run",
11791174
]
11801175

1181-
out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0"
1176+
if "Decoy" in out_mztab_psh.columns:
1177+
out_mztab_psh = out_mztab_psh[
1178+
psm_columns + ["Decoy"]
1179+
]
1180+
out_mztab_psh.columns = [
1181+
"sequence",
1182+
"accession",
1183+
"search_engine_score[1]",
1184+
"retention_time",
1185+
"charge",
1186+
"calc_mass_to_charge",
1187+
"exp_mass_to_charge",
1188+
"opt_global_cv_MS:1000889_peptidoform_sequence",
1189+
"opt_global_SpecEValue_score",
1190+
"opt_global_q-value",
1191+
"opt_global_q-value_score",
1192+
"opt_global_spectrum_reference",
1193+
"ms_run",
1194+
"opt_global_cv_MS:1002217_decoy_peptide"
1195+
]
1196+
else:
1197+
out_mztab_psh = out_mztab_psh[
1198+
psm_columns
1199+
]
1200+
out_mztab_psh.columns = [
1201+
"sequence",
1202+
"accession",
1203+
"search_engine_score[1]",
1204+
"retention_time",
1205+
"charge",
1206+
"calc_mass_to_charge",
1207+
"exp_mass_to_charge",
1208+
"opt_global_cv_MS:1000889_peptidoform_sequence",
1209+
"opt_global_SpecEValue_score",
1210+
"opt_global_q-value",
1211+
"opt_global_q-value_score",
1212+
"opt_global_spectrum_reference",
1213+
"ms_run"
1214+
]
1215+
out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = 0
11821216
out_mztab_psh.loc[:, "PSM_ID"] = out_mztab_psh.index
11831217
out_mztab_psh.loc[:, "unique"] = out_mztab_psh.apply(
11841218
lambda x: "0" if ";" in str(x["accession"]) else "1",

0 commit comments

Comments
 (0)