@@ -126,10 +126,6 @@ def get_subtree_tips(terms: list, name: str, tree):
126126 temp .append (term .name )
127127 subtree_tips .append (temp )
128128
129- # print(name)
130- # import sys
131- # sys.exit()
132-
133129 return subtree_tips , dups
134130
135131
@@ -147,6 +143,8 @@ def handle_multi_copy_subtree(
147143 snap_trees : bool ,
148144 inparalog_to_keep : InparalogToKeep ,
149145 output_path : str ,
146+ inparalog_handling : dict ,
147+ inparalog_handling_summary : dict ,
150148):
151149 """
152150 handling case where subtree contains all single copy genes
@@ -172,17 +170,22 @@ def handle_multi_copy_subtree(
172170 # if duplicate sequences are sister, get the longest sequence
173171 if are_sisters :
174172 # trim short sequences and keep long sequences in newtree
175- newtree , terms = inparalog_to_keep_determination (
176- newtree , fasta_dict , dups , terms , inparalog_to_keep
177- )
173+ newtree , terms , inparalog_handling = \
174+ inparalog_to_keep_determination (
175+ newtree , fasta_dict , dups , terms ,
176+ inparalog_to_keep , inparalog_handling
177+ )
178178
179179 # if the resulting subtree has only single copy genes
180180 # create a fasta file with sequences from tip labels
181- _ , _ , _ , counts = get_tips_and_taxa_names_and_taxa_counts_from_subtrees (newtree )
181+ _ , _ , _ , counts = \
182+ get_tips_and_taxa_names_and_taxa_counts_from_subtrees (newtree )
183+
182184 if set (counts ) == set ([1 ]):
183185 (
184186 subgroup_counter ,
185187 assigned_tips ,
188+ inparalog_handling_summary
186189 ) = write_output_fasta_and_account_for_assigned_tips_single_copy_case (
187190 fasta ,
188191 subgroup_counter ,
@@ -192,9 +195,13 @@ def handle_multi_copy_subtree(
192195 snap_trees ,
193196 newtree ,
194197 output_path ,
198+ inparalog_handling ,
199+ inparalog_handling_summary ,
195200 )
196201
197- return subgroup_counter , assigned_tips
202+ return \
203+ subgroup_counter , assigned_tips , \
204+ inparalog_handling , inparalog_handling_summary
198205
199206
200207def handle_single_copy_subtree (
@@ -208,6 +215,8 @@ def handle_single_copy_subtree(
208215 assigned_tips : list ,
209216 snap_trees : bool ,
210217 output_path : str ,
218+ inparalog_handling : dict ,
219+ inparalog_handling_summary : dict ,
211220):
212221 """
213222 handling case where subtree contains all single copy genes
@@ -223,6 +232,7 @@ def handle_single_copy_subtree(
223232 (
224233 subgroup_counter ,
225234 assigned_tips ,
235+ inparalog_handling_summary
226236 ) = write_output_fasta_and_account_for_assigned_tips_single_copy_case (
227237 fasta ,
228238 subgroup_counter ,
@@ -232,9 +242,13 @@ def handle_single_copy_subtree(
232242 snap_trees ,
233243 newtree ,
234244 output_path ,
245+ inparalog_handling ,
246+ inparalog_handling_summary ,
235247 )
236248
237- return subgroup_counter , assigned_tips
249+ return \
250+ subgroup_counter , assigned_tips , \
251+ inparalog_handling , inparalog_handling_summary
238252
239253
240254def inparalog_to_keep_determination (
@@ -243,6 +257,7 @@ def inparalog_to_keep_determination(
243257 dups : list ,
244258 terms : list ,
245259 inparalog_to_keep : InparalogToKeep ,
260+ inparalog_handling : dict ,
246261):
247262 """
248263 remove_short_sequences_among_duplicates_that_are_sister
@@ -261,22 +276,27 @@ def inparalog_to_keep_determination(
261276 seq_to_keep = min (lengths , key = lengths .get )
262277 elif len (lengths ) > 2 and inparalog_to_keep .value == "median_seq_len" :
263278 median_len = stat .median (lengths , key = lengths .get )
264- seq_to_keep = [key for key , value in lengths if value == median_len ]
279+ seq_to_keep = [
280+ key for key , value in lengths if value == median_len
281+ ]
265282 elif len (lengths ) == 2 and inparalog_to_keep .value == "median_seq_len" :
266283 seq_to_keep = max (lengths , key = lengths .get )
267284 elif inparalog_to_keep .value == "longest_seq_len" :
268285 seq_to_keep = max (lengths , key = lengths .get )
269-
270286 # keep inparalog based on tip to root length
271287 else :
272288 for dup in dups :
273289 lengths [dup ] = TreeMixin .distance (newtree , dup )
274290 if inparalog_to_keep .value == "shortest_branch_len" :
275291 seq_to_keep = min (lengths , key = lengths .get )
276- elif len (lengths ) > 2 and inparalog_to_keep .value == "median_branch_len" :
292+ elif len (lengths ) > 2 and \
293+ inparalog_to_keep .value == "median_branch_len" :
277294 median_len = stat .median (lengths , key = lengths .get )
278- seq_to_keep = [key for key , value in lengths if value == median_len ]
279- elif len (lengths ) == 2 and inparalog_to_keep .value == "median_branch_len" :
295+ seq_to_keep = [
296+ key for key , value in lengths if value == median_len
297+ ]
298+ elif len (lengths ) == 2 and \
299+ inparalog_to_keep .value == "median_branch_len" :
280300 seq_to_keep = max (lengths , key = lengths .get )
281301 elif inparalog_to_keep .value == "longest_branch_len" :
282302 seq_to_keep = max (lengths , key = lengths .get )
@@ -288,15 +308,20 @@ def inparalog_to_keep_determination(
288308 newtree .prune (seq_name )
289309 terms .remove (seq_name )
290310
291- return newtree , terms
311+ dups .remove (seq_to_keep )
312+ inparalog_handling [seq_to_keep ] = dups
313+
314+ return newtree , terms , inparalog_handling
292315
293316
294317def prune_subtree (all_tips : list , terms : list , newtree ):
295318 """
296319 prune tips not of interest for subtree
297320 """
298321
299- tips_to_prune = [i for i in all_tips + terms if i not in all_tips or i not in terms ]
322+ tips_to_prune = [
323+ i for i in all_tips + terms if i not in all_tips or i not in terms
324+ ]
300325
301326 for tip in tips_to_prune :
302327 newtree .prune (tip )
@@ -328,6 +353,8 @@ def write_output_fasta_and_account_for_assigned_tips_single_copy_case(
328353 snap_tree : bool ,
329354 newtree ,
330355 output_path : str ,
356+ inparalog_handling : dict ,
357+ inparalog_handling_summary : dict
331358):
332359 # write output
333360 fasta_path_stripped = re .sub ("^.*/" , "" , fasta )
@@ -345,6 +372,44 @@ def write_output_fasta_and_account_for_assigned_tips_single_copy_case(
345372 )
346373 Phylo .write (newtree , output_file_name , "newick" )
347374
375+ write_summary_file_with_inparalog_handling (
376+ inparalog_handling , fasta ,
377+ output_path , subgroup_counter ,
378+ assigned_tips
379+ )
348380 subgroup_counter += 1
349381
350- return subgroup_counter , assigned_tips
382+ return subgroup_counter , assigned_tips , inparalog_handling_summary
383+
384+
385+ def write_summary_file_with_inparalog_handling (
386+ inparalog_handling : dict ,
387+ fasta : str ,
388+ output_path : str ,
389+ subgroup_count : int ,
390+ assigned_tips : list
391+ ):
392+ res_arr = []
393+
394+ in_file_handle = re .sub ("^.*/" , "" , fasta )
395+
396+ for k , v in inparalog_handling .items ():
397+ temp = []
398+ temp .append (in_file_handle + ".orthosnap." + str (subgroup_count ))
399+ temp .append (k )
400+ temp .append (';' .join (v ))
401+ res_arr .append (temp )
402+ inparalog_report_output_name = in_file_handle + ".inparalog_report.txt"
403+
404+ fasta_path_stripped = re .sub ("^.*/" , "" , fasta )
405+ output_fasta_file_name = (
406+ f"{ output_path } /{ fasta_path_stripped } .orthosnap.{ subgroup_count } .fa"
407+ )
408+
409+ if res_arr :
410+ try :
411+ if res_arr [0 ][1 ] in open (output_fasta_file_name ).read ():
412+ with open (f"{ output_path } { inparalog_report_output_name } " , "a" ) as file :
413+ file .writelines ('\t ' .join (i ) + '\n ' for i in res_arr )
414+ except FileNotFoundError :
415+ 1
0 commit comments