Skip to content

Commit 6da24ad

Browse files
committed
Feat: Adding groups and contrasts file parser.
1 parent 27ce654 commit 6da24ad

File tree

1 file changed

+338
-0
lines changed

1 file changed

+338
-0
lines changed

rna-seek

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,20 @@ def _cp_r_safe_(source, target, resources = []):
223223
copytree(os.path.join(source, resource), destination)
224224

225225

226+
def clean(s, remove=['"', "'"]):
227+
"""Cleans a string to remove any defined leading or trailing characters.
228+
@param s <str>:
229+
String to clean.
230+
@param remove list[<str>]:
231+
List of characters to remove from beginning or end of string 's'.
232+
@return s <str>:
233+
Cleaned string
234+
"""
235+
for c in remove:
236+
s = s.strip(c)
237+
return s
238+
239+
226240
def rename(filename):
227241
"""Dynamically renames FastQ file to have one of the following extensions: *.R1.fastq.gz, *.R2.fastq.gz
228242
To automatically rename the fastq files, a few assumptions are made. If the extension of the
@@ -869,6 +883,330 @@ def resolve_additional_bind_paths(search_paths):
869883
return list(set(common_paths))
870884

871885

886+
def check_file(file, ncols, delim='\t'):
887+
"""Checks if a file is empty and has the expected number of columns.
888+
@param file <str>:
889+
Path to file to check file.
890+
@param ncols <int>:
891+
Number of expected columns, i.e. 2
892+
@param delim <str>:
893+
Delimiter of file, i.e. a tab
894+
@return header list[<str>]:
895+
Header of file represented as a list
896+
"""
897+
# Check to see if the file is empty
898+
c = Colors()
899+
with open(file, 'r') as fh:
900+
try:
901+
header = [clean(col.strip()) for col in next(fh).strip().split(delim)]
902+
except StopIteration:
903+
err('{0}{1}Error: --groups "{2}" file is empty!{3}'.format(c.bg_red, c.white, file, c.end))
904+
fatal('{0}{1}Please add Sample and Group information to the file and try again.{2}'.format(c.bg_red, c.white, c.end))
905+
# Check for expected number of columns
906+
# on each line, report all problematic
907+
# lines and then exit 1 if any errors
908+
errors = False
909+
with open(file, 'r') as fh:
910+
linenumber = 0
911+
for line in fh:
912+
linenumber += 1
913+
linelist = [l.strip() for l in line.split(delim) if l.strip()]
914+
if len(linelist) == 0:
915+
# Skip over lines with the expected
916+
# number of columns and empty line
917+
continue
918+
if len(linelist) < ncols:
919+
# Line is missing a column or it is not
920+
# tab delimited
921+
errors = True
922+
err('{0}{1}Error: --groups "{2}" file does not contain the expected number of columns at line {3}! {4}'.format(
923+
c.bg_red, c.white, file, linenumber, c.end
924+
))
925+
err('{0}{1} └── Bad line contents: "{2}" {3}'.format(c.bg_red, c.white, line.rstrip(), c.end))
926+
if errors:
927+
fatal('{0}{1}Fatal: Please correct these lines and check your file is tab-delimited! {3}'.format(
928+
c.bg_red, c.white, c.end
929+
))
930+
return header
931+
932+
933+
def index(file, delim='\t', required = ['Sample', 'Group']):
934+
"""Returns the index of expected columns in provided file. The groups
935+
file is expected to have the following required columns.
936+
@Required columns:
937+
- Sample, Group
938+
@param file <str>:
939+
Path to groups TSV file.
940+
@return tuple(indices <dict[int/None]>, hasHeader <boolean>):
941+
[0] Dictionary containing information the index of each required/optional column
942+
[1] Boolean to indicate whether file has a header
943+
"""
944+
c = Colors()
945+
indices = {}
946+
has_header = True
947+
# Check to see if the file is empty,
948+
# and it has the expected number of
949+
# columns, i.e 2
950+
header = check_file(file = file, ncols = len(required), delim = delim)
951+
# Parse the header to get the index of
952+
# required fields: i.e Sample, Group
953+
# columns for parsing later
954+
error = False
955+
for col in required:
956+
try: indices[col] = header.index(col)
957+
except ValueError:
958+
error = True
959+
# Missing column names or header in groups file
960+
# This can also occur if the file is not actually
961+
# a tab delimited file.
962+
# TODO: Add a check to see if the file is actually
963+
# a tab delimited file, i.e. a TSV file.
964+
has_header = False
965+
err('{0}{1}Error: groups file is missing the following required column name: {2}{3}'.format(
966+
c.bg_red, c.white, col, c.end
967+
))
968+
# Check for errors and properly
969+
# formatted groups file
970+
if error or (indices.get('Sample',-1)!=0 or indices.get('Group',-1)!=1):
971+
# Exit with non-zero exit code
972+
fatal(textwrap.dedent(
973+
"""
974+
# Fatal: Detected improperly formatted groups file!
975+
# Here example of a groups file where:
976+
# • 1st column = Sample (required): Base name of a
977+
# sample's BAM file without ".bam"
978+
# file extension.
979+
# • 2nd column = Group (required): A sample's group
980+
# information, any group listed here
981+
# must match information provided in
982+
# the contrasts file.
983+
# • Nth column = Extra Covariates (optional): Any
984+
# additional columns after the 1st &
985+
# 2nd column are used to control for
986+
# confounders or covariates during
987+
# differential splicing analysis.
988+
# Note: Numeric values will be
989+
# treated as continuous variables so
990+
# please ensure any categorical values
991+
# start with a letter to ensure they
992+
# are modelled correctly!
993+
# Please ensure your file is tab delimited,
994+
# check your columns names for the first and
995+
# second columns, i.e 1st=Sample, 2nd=Group.
996+
# Please see example below:
997+
Sample Group Sex
998+
Sample_1 G1 F
999+
Sample_2 G1 M
1000+
Sample_3 G1 F
1001+
Sample_4 G2 M
1002+
Sample_5 G2 F
1003+
Sample_6 G2 M
1004+
Sample_7 G3 F
1005+
Sample_8 G3 M
1006+
Sample_9 G3 F
1007+
Sample_10 G3 M
1008+
"""
1009+
))
1010+
return indices, has_header
1011+
1012+
1013+
def groups(file, delim='\t'):
1014+
"""Reads and parses a sample sheet, groups.tsv, into a dictionary.
1015+
This file acts as a sample sheet to gather sample metadata and define
1016+
relationship betweeen groups of samples. This tab delimited file
1017+
contains two columns. One column for the basename of the sample and
1018+
lastly, one column for the name of the sample's group. This group
1019+
information is used downstream in the pipeline for differential
1020+
expression rules. Comparisons between groups can be made with a
1021+
constrast.tsv file. This function returns a tuple containing a
1022+
dictionary containing group to sample list.
1023+
@param file <str>:
1024+
Path to groups TSV file.
1025+
@return groups <dict[str]>:
1026+
Dictionary containing group to samples, where each key is group
1027+
and its value is a list of samples belonging to that group
1028+
@Example: group.tsv
1029+
Sample Group
1030+
Sample_1 G1
1031+
Sample_2 G1
1032+
Sample_3 G1
1033+
Sample_4 G2
1034+
Sample_5 G2
1035+
Sample_6 G2
1036+
Sample_7 G3
1037+
Sample_8 G3
1038+
Sample_9 G3
1039+
Sample_0 G3
1040+
# Example data structure that is return given the file above
1041+
>> groups
1042+
{
1043+
'G1': ['Sample_1', 'Sample_2', 'Sample_3'],
1044+
'G2': ['Sample_4', 'Sample_5', 'Sample_6'],
1045+
'G3': ['Sample_7', 'Sample_8', 'Sample_9', 'Sample_0']
1046+
}
1047+
"""
1048+
# Get index of each required and
1049+
# optional column and determine if
1050+
# the file has a header
1051+
indices, header = index(file)
1052+
s_index = indices['Sample']
1053+
g_index = indices['Group']
1054+
# Parse sample and group
1055+
# information
1056+
groups = {}
1057+
invalid_group = False
1058+
with open(file, 'r') as fh:
1059+
if header:
1060+
# Skip over header
1061+
tmp = next(fh)
1062+
for line in fh:
1063+
linelist = [clean(l.strip()) for l in line.split(delim)]
1064+
# Parse Sample information
1065+
try:
1066+
sample_name = linelist[s_index]
1067+
if not sample_name: continue # skip over empty string
1068+
except IndexError:
1069+
continue # No sample information, skip over line
1070+
# Parse Group Information
1071+
try:
1072+
group = linelist[g_index]
1073+
if not group: continue # skip over empty string
1074+
except IndexError:
1075+
continue # No group information, skip over line
1076+
# Check valid group names, names
1077+
# should start with a letter and
1078+
# can this contain the following
1079+
# characters:
1080+
# letters, numbers, underscores, periods
1081+
if not re.match("^[A-Za-z][A-Za-z0-9_.]*$", group):
1082+
invalid_group = True
1083+
c = Colors()
1084+
err('{0}{1}Error: Invalid group name "{2}" for sample "{3}"!{4}'.format(
1085+
c.bg_red, c.white, group, sample_name, c.end
1086+
))
1087+
err('{0}{1} └── Names must start with a letter and can only contain letters, numbers, underscores (_), and periods (.){2}'.format(
1088+
c.bg_red, c.white, c.end
1089+
))
1090+
# Add sample to its group
1091+
if group not in groups:
1092+
groups[group] = []
1093+
if sample_name not in groups[group]:
1094+
groups[group].append(sample_name)
1095+
if invalid_group:
1096+
fatal('{0}{1}Fatal: Please update the group names in "{2}" and try again!{3}'.format(
1097+
c.bg_red, c.white, file, c.end
1098+
))
1099+
return groups
1100+
1101+
1102+
def extract_group_tokens(expr):
1103+
"""Extracts group names from an expression. Expressions can be relatively
1104+
complex, i.e "(G2-G2)", "(G3+G4)/2", "((G2-G2)-((G3+G4)/2))", or simple
1105+
like "G2". This function uses a regular expression to tokenize the expression
1106+
and extract group names while ignoring other operators and function names.
1107+
@param expr <str>:
1108+
Group expression string, i.e "(G2-G2)", "(G3+G4)/2", "G2"
1109+
@return out <list[str]>:
1110+
List of group names in expression
1111+
"""
1112+
# Tokenize expression
1113+
_TOKEN_RE = re.compile(r"""
1114+
(?P<NUMBER> (?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)? ) |
1115+
(?P<ID> [A-Za-z][A-Za-z0-9._]* ) |
1116+
(?P<OP> \*\*|//|[+\-*/^:] ) |
1117+
(?P<LPAREN> \() |
1118+
(?P<RPAREN> \) ) |
1119+
(?P<WS> \s+ ) |
1120+
(?P<MISMATCH> . )
1121+
""", re.VERBOSE)
1122+
# Find all ID tokens that are not
1123+
# operators or function names
1124+
toks = list(_TOKEN_RE.finditer(expr))
1125+
out, seen = [], set()
1126+
for i, t in enumerate(toks):
1127+
if t.lastgroup != "ID":
1128+
continue
1129+
# exclude function names: next non-WS token is '('
1130+
j = i + 1
1131+
while j < len(toks) and toks[j].lastgroup == "WS":
1132+
j += 1
1133+
if j < len(toks) and toks[j].lastgroup == "LPAREN":
1134+
continue
1135+
name = t.group()
1136+
if name not in seen:
1137+
seen.add(name)
1138+
out.append(name)
1139+
return out
1140+
1141+
1142+
def contrasts(file, groups, delim='\t'):
1143+
"""Reads and parses the group comparison file, contrasts.tsv, into a
1144+
dictionary. This file acts as a config file to setup contrasts between
1145+
two groups, where groups of samples are defined in the groups.tsv file.
1146+
This information is used in differential analysis, like differential binding
1147+
analysis, differential gene expression analysis, etc.
1148+
@Example: contrasts.tsv
1149+
G2 G1
1150+
G4 G3
1151+
G5 G1
1152+
>> contrasts = contrasts('contrasts.tsv', groups = ['G1', 'G2', 'G3'])
1153+
>> contrasts
1154+
[
1155+
["G2", "G1"],
1156+
["G3", "G1"],
1157+
["G3", "G2"]
1158+
]
1159+
@param file <str>:
1160+
Path to contrasts TSV file.
1161+
@param groups list[<str>]:
1162+
List of groups defined in the groups file, enforces groups exist.
1163+
@return comparisons <list[list[str, str]]>:
1164+
Nested list contain comparsions of interest.
1165+
"""
1166+
c = Colors()
1167+
errors = []
1168+
comparsions = []
1169+
line_number = 0
1170+
with open(file) as fh:
1171+
for line in fh:
1172+
line_number += 1
1173+
linelist = [clean(l.strip()) for l in line.split(delim)]
1174+
try:
1175+
g1 = linelist[0] # Case group
1176+
g2 = linelist[1] # Control group
1177+
if not g1 or not g2: continue # skip over empty lines
1178+
except IndexError:
1179+
# Missing a group, need two groups to tango
1180+
# This can happen if the file is NOT a TSV file,
1181+
# and it is seperated by white spaces, :(
1182+
err(
1183+
'{0}{1}Warning: {2} is missing at least one group on line {3}: {4}{5}'.format(
1184+
c.bg_yellow, c.black, file, line_number, line.strip(), c.end
1185+
))
1186+
err('{0}{1} └── Skipping over line, check if line is tab seperated... {2}'.format(
1187+
c.bg_yellow, c.black, c.end
1188+
))
1189+
continue
1190+
# Check to see if groups where defined already,
1191+
# avoids user errors and spelling errors
1192+
for g in [*extract_group_tokens(g1), *extract_group_tokens(g2)]:
1193+
if g not in groups:
1194+
# Collect all error and report them at end
1195+
errors.append(g)
1196+
# Add comparsion to list of comparisons
1197+
if [g1, g2] not in comparsions:
1198+
comparsions.append([g1, g2])
1199+
if errors:
1200+
# One of the groups is not defined in groups
1201+
err('{0}{1}Error: the following group(s) in "{2}" are not defined in groups file! {3}'.format(
1202+
c.bg_red, c.white, file, c.end
1203+
))
1204+
fatal('{0}{1} └── {2} {3}'.format(
1205+
c.bg_red, c.white, ','.join(errors), c.end
1206+
))
1207+
return comparsions
1208+
1209+
8721210
def run(sub_args):
8731211
"""Initialize, setup, and run the RNA-seek pipeline.
8741212
Calls initialize() to create output directory and copy over pipeline resources,

0 commit comments

Comments
 (0)