Skip to content

Commit e7db77f

Browse files
committed
added a new argument, -d and --delimiter, for users to specify what delimiter they are using in their FASTA and NEWICK tree files.
1 parent de75b06 commit e7db77f

File tree

40 files changed

+2200
-27
lines changed

40 files changed

+2200
-27
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ coverage.unit:
3131
coverage.integration:
3232
rm tests/samples/OG0000010.renamed.fa.mafft.clipkit.orthosnap.*
3333
python -m pytest --basetemp=output --cov=./ -m "integration" --cov-report=xml:integration.coverage.xml
34-
rm tests/samples/OG0000010.renamed.fa.mafft.clipkit.orthosnap.*
34+
rm tests/samples/OG0000010.renamed.fa.mafft.clipkit.orthosnap.*

docs/change_log/index.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ Change log
88

99
Major changes to OrthoSNAP are summarized here.
1010

11+
**1.3.2**
12+
Added argument for user-defined delimiter (-d, \-\-delimiter)
13+
1114
**1.2.0**
1215
Added the -rih (\-\-report_inparalog_handling) function, which creates
1316
a summary file of which inparalogs were kept compared to trimmed

docs/usage/index.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,22 @@ To generate this file, use the following command:
9797
9898
|
9999
100+
Specifying the delimiter
101+
------------------------
102+
103+
As of version 1.3.2, OrthoSNAP supports an argument for user-defined delimiters between
104+
the taxon name and gene name.
105+
106+
By default, the delimiter is a pipe character (or "|"), but can be changed by setting the
107+
-d/\-\-delimiter argument. For example, if "-" is the current delimiter between taxon and
108+
gene names, use the following command:
109+
110+
.. code-block:: shell
111+
112+
$ orthosnap -f orthogroup_of_genes.faa -t phylogeny_of_orthogroup_of_genes.tre -d -
113+
114+
|
115+
100116
All options
101117
-----------
102118

@@ -115,6 +131,8 @@ All options
115131
+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
116132
| -r/\-\-rooted | boolean argument for whether the input phylogeny is already rooted (default: false) |
117133
+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
134+
| -d/\-\-delimiter | specify the delimiter found between taxon names and gene names (default: "|") |
135+
+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
118136
| -st/\-\-snap_trees | boolean argument for whether trees of SNAP-OGs should be outputted (default: false) |
119137
+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
120138
| -ip/\-\-inparalog_to_keep | determine which sequence to keep in the case of species-specific inparalogs using sequence- or tree-based options (default: longest_seq_len) |

orthosnap/args_processing.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ def process_args(args) -> dict:
1818
tree = args.tree
1919
fasta = args.fasta
2020

21+
delimiter = args.delimiter if args.delimiter is not None else "|"
22+
2123
if not os.path.isfile(tree):
2224
logger.warning("Input tree does not exist")
2325
sys.exit()
@@ -30,7 +32,7 @@ def process_args(args) -> dict:
3032
occupancy = (
3133
args.occupancy
3234
if args.occupancy is not None
33-
else determine_occupancy_threshold(fasta)
35+
else determine_occupancy_threshold(fasta, delimiter)
3436
)
3537

3638
if occupancy <= 0:
@@ -74,15 +76,16 @@ def process_args(args) -> dict:
7476
inparalog_to_keep=inparalog_to_keep,
7577
report_inparalog_handling=report_inparalog_handling,
7678
output_path=output_path,
79+
delimiter=delimiter,
7780
)
7881

7982

80-
def determine_occupancy_threshold(fasta: str) -> int:
83+
def determine_occupancy_threshold(fasta: str, delimiter: str) -> int:
8184
fasta = SeqIO.parse(fasta, "fasta")
8285
unique_names = []
8386

8487
for i in fasta:
85-
i = i.id.split("|", 1)[0]
88+
i = i.id.split(delimiter, 1)[0]
8689
if i not in unique_names:
8790
unique_names.append(i)
8891

orthosnap/helper.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from enum import Enum
44
import re
55
import statistics as stat
6+
import sys
67

78
from Bio import Phylo
89
from Bio import SeqIO
@@ -28,7 +29,11 @@ def collapse_low_support_bipartitions(newtree, support: float):
2829
return newtree
2930

3031

31-
def determine_if_dups_are_sister(subtree_tips: list, newtree):
32+
def determine_if_dups_are_sister(
33+
subtree_tips: list,
34+
newtree,
35+
delimiter: str,
36+
):
3237
"""
3338
determine if dups are sister to one another
3439
"""
@@ -41,7 +46,7 @@ def determine_if_dups_are_sister(subtree_tips: list, newtree):
4146
dup_tree = copy.deepcopy(newtree)
4247

4348
dup_tree = dup_tree.common_ancestor(subtree_tips)
44-
_, all_tips = get_all_tips_and_taxa_names(dup_tree)
49+
_, all_tips = get_all_tips_and_taxa_names(dup_tree, delimiter)
4550
if set(all_tips) != set(subtree_tips):
4651
are_sisters = False
4752

@@ -55,7 +60,7 @@ def determine_if_dups_are_sister(subtree_tips: list, newtree):
5560
return are_sisters
5661

5762

58-
def get_all_tips_and_taxa_names(tree):
63+
def get_all_tips_and_taxa_names(tree, delimiter: str):
5964
"""
6065
get all taxa and tip names in a phylogeny
6166
@@ -66,7 +71,11 @@ def get_all_tips_and_taxa_names(tree):
6671

6772
# loop through the tree and collect terminal names
6873
for term in tree.get_terminals():
69-
taxa_name = term.name[: term.name.index("|")]
74+
try:
75+
taxa_name = term.name[: term.name.index(delimiter)]
76+
except ValueError:
77+
print("\nERROR: Delimiter does not exist in FASTA headers.\nSpecify the delimiter using the -d argument.")
78+
sys.exit()
7079
if taxa_name not in taxa:
7180
taxa.append(taxa_name)
7281
all_tips.append(term.name)
@@ -86,15 +95,15 @@ def check_if_single_copy(taxa: list, all_tips: list):
8695
return False
8796

8897

89-
def get_tips_and_taxa_names_and_taxa_counts_from_subtrees(inter):
98+
def get_tips_and_taxa_names_and_taxa_counts_from_subtrees(inter, delimiter: str):
9099
"""
91100
get taxa, counts of each taxa, and all terminal names
92101
"""
93102
taxa_from_terms = []
94103
terms = []
95104
# get taxa and terminal names from subtree
96105
for term in inter.get_terminals():
97-
taxa_from_terms.append(term.name.split("|", 1)[0])
106+
taxa_from_terms.append(term.name.split(delimiter, 1)[0])
98107
terms.append(term.name)
99108
# count number of taxa in subtree
100109
counts_of_taxa_from_terms = Counter(taxa_from_terms)
@@ -145,6 +154,7 @@ def handle_multi_copy_subtree(
145154
output_path: str,
146155
inparalog_handling: dict,
147156
inparalog_handling_summary: dict,
157+
delimiter: str,
148158
):
149159
"""
150160
handling case where subtree contains all single copy genes
@@ -165,7 +175,9 @@ def handle_multi_copy_subtree(
165175

166176
# check if subtrees are sister to one another
167177
# are_sisters = determine_if_dups_are_sister(subtree_tips)
168-
are_sisters = determine_if_dups_are_sister(dups, newtree)
178+
are_sisters = determine_if_dups_are_sister(
179+
dups, newtree, delimiter
180+
)
169181

170182
# if duplicate sequences are sister, get the longest sequence
171183
if are_sisters:
@@ -179,7 +191,7 @@ def handle_multi_copy_subtree(
179191
# if the resulting subtree has only single copy genes
180192
# create a fasta file with sequences from tip labels
181193
_, _, _, counts = \
182-
get_tips_and_taxa_names_and_taxa_counts_from_subtrees(newtree)
194+
get_tips_and_taxa_names_and_taxa_counts_from_subtrees(newtree, delimiter)
183195

184196
if set(counts) == set([1]):
185197
(

orthosnap/orthosnap.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def execute(
3333
inparalog_to_keep: InparalogToKeep,
3434
report_inparalog_handling: bool,
3535
output_path: str,
36+
delimiter: str,
3637
):
3738
"""
3839
Master execute Function
@@ -58,6 +59,7 @@ def execute(
5859
inparalog_to_keep,
5960
report_inparalog_handling,
6061
output_path,
62+
delimiter,
6163
)
6264

6365
# create start time logger
@@ -67,7 +69,7 @@ def execute(
6769
tree, fasta_dict = read_input_files(tree, fasta, rooted)
6870

6971
# get list of all tip names and taxa names
70-
taxa, all_tips = get_all_tips_and_taxa_names(tree)
72+
taxa, all_tips = get_all_tips_and_taxa_names(tree, delimiter)
7173

7274
# check if the inputted phylogeny is already a single-copy tree.
7375
# if it is, exit
@@ -89,7 +91,9 @@ def execute(
8991
terms,
9092
counts_of_taxa_from_terms,
9193
counts,
92-
) = get_tips_and_taxa_names_and_taxa_counts_from_subtrees(inter)
94+
) = get_tips_and_taxa_names_and_taxa_counts_from_subtrees(
95+
inter, delimiter
96+
)
9397

9498
# create a copy of the input tree
9599
newtree = copy.deepcopy(tree)
@@ -142,6 +146,7 @@ def execute(
142146
output_path,
143147
inparalog_handling,
144148
inparalog_handling_summary,
149+
delimiter,
145150
)
146151

147152
if report_inparalog_handling:

orthosnap/parser.py

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,19 @@ def create_parser():
1919
usage=SUPPRESS,
2020
description=textwrap.dedent(
2121
f"""\
22-
_ _
23-
| | | |
24-
___ _ __| |_| |__ ___ ___ _ __ __ _ _ __
25-
/ _ \| '__| __| '_ \ / _ \/ __| '_ \ / _` | '_ \
26-
| (_) | | | |_| | | | (_) \__ \ | | | (_| | |_) |
27-
\___/|_| \__|_| |_|\___/|___/_| |_|\__,_| .__/
28-
| |
29-
|_|
22+
____ _ _ _____ _ _ _____
23+
/ __ \ | | | | / ____| \ | | /\ | __ \
24+
| | | |_ __| |_| |__ ___| (___ | \| | / \ | |__) |
25+
| | | | '__| __| '_ \ / _ \\___ \| . ` | / /\ \ | ___/
26+
| |__| | | | |_| | | | (_) |___) | |\ |/ ____ \| |
27+
\____/|_| \__|_| |_|\___/_____/|_| \_/_/ \_\_|
28+
3029
3130
Version: {__version__}
32-
Citation: Steenwyk et al.
33-
Usage: orthosnap <input> [optional arguments]
31+
Citation: Steenwyk et al. (2022), PLOS Biology. DOI: 10.1371/journal.pbio.3001827.
32+
https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001827
33+
34+
Usage: orthosnap -f <fasta_file> -t <newick_tree_file> [optional arguments]
3435
"""
3536
),
3637
)
@@ -89,6 +90,10 @@ def create_parser():
8990
boolean argument for whether the input phylogeny is already rooted
9091
default: false
9192
93+
-d, --delimiter
94+
string used as the delimiter between species and sequence name
95+
default: "|"
96+
9297
-st, --snap_trees
9398
boolean argument for whether trees of SNAP-OGs should be outputted
9499
default: false
@@ -149,6 +154,13 @@ def create_parser():
149154
- if used, the input phylogeny is assumed to be rooted; if not,
150155
the tree will be midpoint rooted
151156
157+
-d, --delimiter
158+
- separator between taxon name and gene sequence
159+
- the default is a pipe character (or "|")
160+
- for example, a FASTA header may be as follows:
161+
>species_A|gene0 to specify species_A and
162+
gene identifier gene0
163+
152164
-st, --snap_trees
153165
- boolean argument for whether newick files of SNAP-OGs should also
154166
be outputted
@@ -205,6 +217,14 @@ def create_parser():
205217
help=SUPPRESS,
206218
)
207219

220+
optional.add_argument(
221+
"-d",
222+
"--delimiter",
223+
type=str,
224+
required=False,
225+
help=SUPPRESS,
226+
)
227+
208228
optional.add_argument(
209229
"-st",
210230
"--snap_trees",

orthosnap/writer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def write_user_args(
1515
inparalog_to_keep: InparalogToKeep,
1616
report_inparalog_handling: bool,
1717
output_path: str,
18+
delimiter: str,
1819
):
1920
"""
2021
Function to print user arguments to stdout
@@ -31,6 +32,7 @@ def write_user_args(
3132
Report inparalog handling: {report_inparalog_handling}
3233
Support threshold: {support}
3334
Taxon occupancy threshold: {occupancy}
35+
Delimiter: {delimiter}
3436
Output newick of SNAP-OGs: {snap_trees}
3537
Output directory: {output_path}
3638
"""

0 commit comments

Comments
 (0)