@@ -90,7 +90,6 @@ def diann2mztab(
9090 "Modified.Sequence" ,
9191 "Precursor.Charge" ,
9292 "Precursor.Quantity" ,
93- "File.Name" ,
9493 "Run" ,
9594 ]
9695
@@ -101,7 +100,6 @@ def diann2mztab(
101100 "PeptideSequence" ,
102101 "PrecursorCharge" ,
103102 "Intensity" ,
104- "Reference" ,
105103 "Run" ,
106104 ]
107105 out_msstats = out_msstats [out_msstats ["Intensity" ] != 0 ]
@@ -113,12 +111,6 @@ def diann2mztab(
113111 out_msstats ["FragmentIon" ] = "NA"
114112 out_msstats ["ProductCharge" ] = "0"
115113 out_msstats ["IsotopeLabelType" ] = "L"
116- unique_reference_map = {k : os .path .basename (k ) for k in out_msstats ["Reference" ].unique ()}
117- out_msstats ["Reference" ] = out_msstats ["Reference" ].map (unique_reference_map )
118- del unique_reference_map
119-
120- logger .debug ("\n \n Reference Column >>>" )
121- logger .debug (out_msstats ["Reference" ])
122114
123115 logger .debug (f"\n \n out_msstats ({ out_msstats .shape } ) >>>" )
124116 logger .debug (out_msstats .head (5 ))
@@ -325,7 +317,11 @@ def find_first_file_with_suffix(self, suffix: str) -> os.PathLike:
325317
326318 @property
327319 def report (self ) -> os .PathLike :
328- return self .find_first_file_with_suffix ("report.tsv" )
320+ # DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet
321+ try :
322+ return self .find_first_file_with_suffix ("report.tsv" )
323+ except FileNotFoundError :
324+ return self .find_first_file_with_suffix ("report.parquet" )
329325
330326 @property
331327 def pg_matrix (self ) -> os .PathLike :
@@ -362,7 +358,7 @@ def diann_version(self) -> str:
362358 return diann_version_id
363359
364360 def validate_diann_version (self ) -> None :
365- supported_diann_versions = ["1.8.1" , "1.9.beta .1" , "1.9 .2" ]
361+ supported_diann_versions = ["1.8.1" , "2.0" , "2.0 .1" , "2.0 .2" ]
366362 if self .diann_version not in supported_diann_versions :
367363 raise ValueError (f"Unsupported DIANN version { self .diann_version } " )
368364
@@ -418,7 +414,7 @@ def convert_to_mztab(
418414 sep = "\t " ,
419415 header = 0 ,
420416 )
421- prh = mztab_prh (report , pg , index_ref , database , fasta_df )
417+ prh = mztab_prh (report , pg , index_ref , database , fasta_df , self . diann_version )
422418 del pg
423419 pr = pd .read_csv (
424420 self .pr_matrix ,
@@ -443,15 +439,12 @@ def convert_to_mztab(
443439
444440 def main_report_df (self , qvalue_threshold : float ) -> pd .DataFrame :
445441 remain_cols = [
446- "File.Name" ,
447442 "Run" ,
448443 "Protein.Group" ,
449444 "Protein.Names" ,
450445 "Protein.Ids" ,
451- "First.Protein.Description" ,
452446 "PG.MaxLFQ" ,
453447 "RT" ,
454- "MS2.Scan" ,
455448 "Global.Q.Value" ,
456449 "Lib.Q.Value" ,
457450 "PEP" ,
@@ -462,10 +455,13 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
462455 "Stripped.Sequence" ,
463456 "Precursor.Charge" ,
464457 "Precursor.Quantity" ,
465- "Global.PG.Q.Value" ,
466- "MS2.Scan" ,
458+ "Global.PG.Q.Value"
467459 ]
468- report = pd .read_csv (self .report , sep = "\t " , header = 0 , usecols = remain_cols )
460+ if "2.0" not in self .diann_version :
461+ report = pd .read_csv (self .report , sep = "\t " , header = 0 ,
462+ usecols = remain_cols + ["MS2.Scan" ])
463+ else :
464+ report = pd .read_parquet (self .report , columns = remain_cols + ["Decoy" ])
469465
470466 # filter based on qvalue parameter for downstream analysiss
471467 logger .debug (
@@ -675,7 +671,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
675671 return out_mztab_mtd_t , database
676672
677673
678- def mztab_prh (report , pg , index_ref , database , fasta_df ):
674+ def mztab_prh (report , pg , index_ref , database , fasta_df , diann_version ):
679675 """
680676 Construct PRH sub-table.
681677
@@ -689,6 +685,8 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
689685 :type database: str
690686 :param fasta_df: A dataframe contains protein IDs, sequences and lengths
691687 :type fasta_df: pandas.core.frame.DataFrame
688+ :param diann_version: DIA-NN version
689+ :type diann_version: str
692690 :return: PRH sub-table
693691 :rtype: pandas.core.frame.DataFrame
694692 """
@@ -699,7 +697,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
699697 f" input index_ref shape: { index_ref .shape } ,"
700698 f" input fasta_df shape: { fasta_df .shape } "
701699 )
702- file = list (pg .columns [5 :])
700+ # DIA-NN 2.0 PG file doesn't have Protein IDs column
701+ if "2.0" not in diann_version :
702+ file = list (pg .columns [5 :])
703+ else :
704+ file = list (pg .columns [4 :])
703705 col = {}
704706 for i in file :
705707 col [i ] = (
@@ -961,6 +963,7 @@ def mztab_peh(
961963 }
962964 name_mapper = name_mapper_builder (subname_mapper )
963965 tmp .rename (columns = name_mapper , inplace = True )
966+ print (tmp .columns )
964967 out_mztab_peh = out_mztab_peh .merge (
965968 tmp .rename (columns = {"precursor.Index" : "pr_id" }),
966969 on = "pr_id" ,
@@ -1087,98 +1090,129 @@ def __find_info(directory, n):
10871090 ].astype (str )
10881091
10891092 # TODO seconds returned from precursor.getRT()
1090- target .loc [:, "RT" ] = target .apply (lambda x : x ["RT" ] / 60 , axis = 1 )
1093+ # RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type
1094+ target .loc [:, "RT" ] = target .apply (lambda x : (x ["RT" ] / 60 ), axis = 1 )
1095+ group ['RT' ] = group ['RT' ].astype ('float64' )
10911096
10921097 rt_matched = pd .merge_asof (group , target , on = "RT" , direction = "nearest" )
1093- new_target = target
1094- new_target .columns = [
1095- "scan_RT" ,
1096- "scan_opt_global_spectrum_reference" ,
1097- "MS2.Scan" ,
1098- "scan_exp_mass_to_charge" ,
1099- ]
1100- scan_matched = pd .merge (rt_matched , new_target , on = "MS2.Scan" )
1098+ # DIA-NN can't export MS2.Scan column in some versions (eg. DIA-NN 2.0, DIA-NN 2.0.1).
1099+ # So we only match MS2 by RT.
1100+ if "MS2.Scan" in group .columns :
1101+ logger .info ("Mapping DIA-NN MS2 to mzML by MS2.Scan and RT columns" )
1102+ new_target = target
1103+ new_target .columns = [
1104+ "scan_RT" ,
1105+ "scan_opt_global_spectrum_reference" ,
1106+ "MS2.Scan" ,
1107+ "scan_exp_mass_to_charge" ,
1108+ ]
1109+ scan_matched = pd .merge (rt_matched , new_target , on = "MS2.Scan" )
11011110
1102- # Cross validation spectrum ID between scan matched and RT matched
1103- # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
1104- scan_matched ["unassigned_matched" ] = scan_matched .apply (
1105- lambda row : 1 if row ["MS2.Scan" ] != row ["DIANN-intraID" ] else 0 , axis = 1
1106- )
1107- if len (scan_matched [scan_matched ["unassigned_matched" ] == 1 ]) > 0 :
1108- v_str = scan_matched [scan_matched ["unassigned_matched" ] == 1 ]["MS2.Scan" ].tolist ()
1109- logger .info (
1110- f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: { v_str } "
1111- )
1112- scan_matched .drop (
1113- [
1114- "RT" ,
1115- "opt_global_spectrum_reference" ,
1116- "DIANN-intraID" ,
1117- "exp_mass_to_charge" ,
1118- "unassigned_matched" ,
1119- ],
1120- inplace = True ,
1121- axis = 1 ,
1122- )
1123- scan_matched .rename (
1124- columns = {
1125- "scan_RT" : "RT" ,
1126- "scan_opt_global_spectrum_reference" : "opt_global_spectrum_reference" ,
1127- "scan_exp_mass_to_charge" : "exp_mass_to_charge" ,
1128- },
1129- inplace = True ,
1111+ # Cross validation spectrum ID between scan matched and RT matched
1112+ # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
1113+ scan_matched ["unassigned_matched" ] = scan_matched .apply (
1114+ lambda row : 1 if row ["MS2.Scan" ] != row ["DIANN-intraID" ] else 0 , axis = 1
11301115 )
1116+ if len (scan_matched [scan_matched ["unassigned_matched" ] == 1 ]) > 0 :
1117+ v_str = scan_matched [scan_matched ["unassigned_matched" ] == 1 ]["MS2.Scan" ].tolist ()
1118+ logger .info (
1119+ f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: { v_str } "
1120+ )
1121+ scan_matched .drop (
1122+ [
1123+ "RT" ,
1124+ "opt_global_spectrum_reference" ,
1125+ "DIANN-intraID" ,
1126+ "exp_mass_to_charge" ,
1127+ "unassigned_matched" ,
1128+ ],
1129+ inplace = True ,
1130+ axis = 1 ,
1131+ )
1132+ scan_matched .rename (
1133+ columns = {
1134+ "scan_RT" : "RT" ,
1135+ "scan_opt_global_spectrum_reference" : "opt_global_spectrum_reference" ,
1136+ "scan_exp_mass_to_charge" : "exp_mass_to_charge" ,
1137+ },
1138+ inplace = True ,
1139+ )
1140+ else :
1141+ scan_matched .drop (
1142+ [
1143+ "scan_RT" ,
1144+ "scan_opt_global_spectrum_reference" ,
1145+ "scan_exp_mass_to_charge" ,
1146+ "unassigned_matched" ,
1147+ ],
1148+ inplace = True ,
1149+ axis = 1 ,
1150+ )
1151+
1152+ out_mztab_psh = pd .concat ([out_mztab_psh , scan_matched ])
11311153 else :
1132- scan_matched .drop (
1133- [
1134- "scan_RT" ,
1135- "scan_opt_global_spectrum_reference" ,
1136- "scan_exp_mass_to_charge" ,
1137- "unassigned_matched" ,
1138- ],
1139- inplace = True ,
1140- axis = 1 ,
1141- )
1142-
1143- out_mztab_psh = pd .concat ([out_mztab_psh , scan_matched ])
1154+ logger .info ("MS2.Scan column isn't in DIA-NN report, only Matching MS2 by RT values" )
1155+ out_mztab_psh = pd .concat ([out_mztab_psh , rt_matched ])
11441156
11451157 del report
11461158
11471159 # Score at PSM level: Q.Value
1148- out_mztab_psh = out_mztab_psh [
1149- [
1150- "Stripped.Sequence" ,
1151- "Protein.Group" ,
1152- "Q.Value" ,
1153- "RT" ,
1154- "Precursor.Charge" ,
1155- "Calculate.Precursor.Mz" ,
1156- "exp_mass_to_charge" ,
1157- "Modified.Sequence" ,
1158- "PEP" ,
1159- "Global.Q.Value" ,
1160- "Global.Q.Value" ,
1161- "opt_global_spectrum_reference" ,
1162- "ms_run" ,
1163- ]
1164- ]
1165- out_mztab_psh .columns = [
1166- "sequence" ,
1167- "accession" ,
1168- "search_engine_score[1]" ,
1169- "retention_time" ,
1170- "charge" ,
1171- "calc_mass_to_charge" ,
1160+ psm_columns = [
1161+ "Stripped.Sequence" ,
1162+ "Protein.Group" ,
1163+ "Q.Value" ,
1164+ "RT" ,
1165+ "Precursor.Charge" ,
1166+ "Calculate.Precursor.Mz" ,
11721167 "exp_mass_to_charge" ,
1173- "opt_global_cv_MS:1000889_peptidoform_sequence " ,
1174- "opt_global_SpecEValue_score " ,
1175- "opt_global_q-value " ,
1176- "opt_global_q-value_score " ,
1168+ "Modified.Sequence " ,
1169+ "PEP " ,
1170+ "Global.Q.Value " ,
1171+ "Global.Q.Value " ,
11771172 "opt_global_spectrum_reference" ,
11781173 "ms_run" ,
11791174 ]
11801175
1181- out_mztab_psh .loc [:, "opt_global_cv_MS:1002217_decoy_peptide" ] = "0"
1176+ if "Decoy" in out_mztab_psh .columns :
1177+ out_mztab_psh = out_mztab_psh [
1178+ psm_columns + ["Decoy" ]
1179+ ]
1180+ out_mztab_psh .columns = [
1181+ "sequence" ,
1182+ "accession" ,
1183+ "search_engine_score[1]" ,
1184+ "retention_time" ,
1185+ "charge" ,
1186+ "calc_mass_to_charge" ,
1187+ "exp_mass_to_charge" ,
1188+ "opt_global_cv_MS:1000889_peptidoform_sequence" ,
1189+ "opt_global_SpecEValue_score" ,
1190+ "opt_global_q-value" ,
1191+ "opt_global_q-value_score" ,
1192+ "opt_global_spectrum_reference" ,
1193+ "ms_run" ,
1194+ "opt_global_cv_MS:1002217_decoy_peptide"
1195+ ]
1196+ else :
1197+ out_mztab_psh = out_mztab_psh [
1198+ psm_columns
1199+ ]
1200+ out_mztab_psh .columns = [
1201+ "sequence" ,
1202+ "accession" ,
1203+ "search_engine_score[1]" ,
1204+ "retention_time" ,
1205+ "charge" ,
1206+ "calc_mass_to_charge" ,
1207+ "exp_mass_to_charge" ,
1208+ "opt_global_cv_MS:1000889_peptidoform_sequence" ,
1209+ "opt_global_SpecEValue_score" ,
1210+ "opt_global_q-value" ,
1211+ "opt_global_q-value_score" ,
1212+ "opt_global_spectrum_reference" ,
1213+ "ms_run"
1214+ ]
1215+ out_mztab_psh .loc [:, "opt_global_cv_MS:1002217_decoy_peptide" ] = 0
11821216 out_mztab_psh .loc [:, "PSM_ID" ] = out_mztab_psh .index
11831217 out_mztab_psh .loc [:, "unique" ] = out_mztab_psh .apply (
11841218 lambda x : "0" if ";" in str (x ["accession" ]) else "1" ,
0 commit comments