@@ -223,6 +223,20 @@ def _cp_r_safe_(source, target, resources = []):
223223 copytree (os .path .join (source , resource ), destination )
224224
225225
226+ def clean (s , remove = ['"' , "'" ]):
227+ """Cleans a string to remove any defined leading or trailing characters.
228+ @param s <str>:
229+ String to clean.
230+ @param remove list[<str>]:
231+ List of characters to remove from beginning or end of string 's'.
232+ @return s <str>:
233+ Cleaned string
234+ """
235+ for c in remove :
236+ s = s .strip (c )
237+ return s
238+
239+
226240def rename (filename ):
227241 """Dynamically renames FastQ file to have one of the following extensions: *.R1.fastq.gz, *.R2.fastq.gz
228242 To automatically rename the fastq files, a few assumptions are made. If the extension of the
@@ -869,6 +883,330 @@ def resolve_additional_bind_paths(search_paths):
869883 return list (set (common_paths ))
870884
871885
886+ def check_file (file , ncols , delim = '\t ' ):
887+ """Checks if a file is empty and has the expected number of columns.
888+ @param file <str>:
889+ Path to file to check file.
890+ @param ncols <int>:
891+ Number of expected columns, i.e. 2
892+ @param delim <str>:
893+ Delimiter of file, i.e. a tab
894+ @return header list[<str>]:
895+ Header of file represented as a list
896+ """
897+ # Check to see if the file is empty
898+ c = Colors ()
899+ with open (file , 'r' ) as fh :
900+ try :
901+ header = [clean (col .strip ()) for col in next (fh ).strip ().split (delim )]
902+ except StopIteration :
903+ err ('{0}{1}Error: --groups "{2}" file is empty!{3}' .format (c .bg_red , c .white , file , c .end ))
904+ fatal ('{0}{1}Please add Sample and Group information to the file and try again.{2}' .format (c .bg_red , c .white , c .end ))
905+ # Check for expected number of columns
906+ # on each line, report all problematic
907+ # lines and then exit 1 if any errors
908+ errors = False
909+ with open (file , 'r' ) as fh :
910+ linenumber = 0
911+ for line in fh :
912+ linenumber += 1
913+ linelist = [l .strip () for l in line .split (delim ) if l .strip ()]
914+ if len (linelist ) == 0 :
915+ # Skip over lines with the expected
916+ # number of columns and empty line
917+ continue
918+ if len (linelist ) < ncols :
919+ # Line is missing a column or it is not
920+ # tab delimited
921+ errors = True
922+ err ('{0}{1}Error: --groups "{2}" file does not contain the expected number of columns at line {3}! {4}' .format (
923+ c .bg_red , c .white , file , linenumber , c .end
924+ ))
925+ err ('{0}{1} └── Bad line contents: "{2}" {3}' .format (c .bg_red , c .white , line .rstrip (), c .end ))
926+ if errors :
927+ fatal ('{0}{1}Fatal: Please correct these lines and check your file is tab-delimited! {3}' .format (
928+ c .bg_red , c .white , c .end
929+ ))
930+ return header
931+
932+
933+ def index (file , delim = '\t ' , required = ['Sample' , 'Group' ]):
934+ """Returns the index of expected columns in provided file. The groups
935+ file is expected to have the following required columns.
936+ @Required columns:
937+ - Sample, Group
938+ @param file <str>:
939+ Path to groups TSV file.
940+ @return tuple(indices <dict[int/None]>, hasHeader <boolean>):
941+ [0] Dictionary containing information the index of each required/optional column
942+ [1] Boolean to indicate whether file has a header
943+ """
944+ c = Colors ()
945+ indices = {}
946+ has_header = True
947+ # Check to see if the file is empty,
948+ # and it has the expected number of
949+ # columns, i.e 2
950+ header = check_file (file = file , ncols = len (required ), delim = delim )
951+ # Parse the header to get the index of
952+ # required fields: i.e Sample, Group
953+ # columns for parsing later
954+ error = False
955+ for col in required :
956+ try : indices [col ] = header .index (col )
957+ except ValueError :
958+ error = True
959+ # Missing column names or header in groups file
960+ # This can also occur if the file is not actually
961+ # a tab delimited file.
962+ # TODO: Add a check to see if the file is actually
963+ # a tab delimited file, i.e. a TSV file.
964+ has_header = False
965+ err ('{0}{1}Error: groups file is missing the following required column name: {2}{3}' .format (
966+ c .bg_red , c .white , col , c .end
967+ ))
968+ # Check for errors and properly
969+ # formatted groups file
970+ if error or (indices .get ('Sample' ,- 1 )!= 0 or indices .get ('Group' ,- 1 )!= 1 ):
971+ # Exit with non-zero exit code
972+ fatal (textwrap .dedent (
973+ """
974+ # Fatal: Detected improperly formatted groups file!
975+ # Here example of a groups file where:
976+ # • 1st column = Sample (required): Base name of a
977+ # sample's BAM file without ".bam"
978+ # file extension.
979+ # • 2nd column = Group (required): A sample's group
980+ # information, any group listed here
981+ # must match information provided in
982+ # the contrasts file.
983+ # • Nth column = Extra Covariates (optional): Any
984+ # additional columns after the 1st &
985+ # 2nd column are used to control for
986+ # confounders or covariates during
987+ # differential splicing analysis.
988+ # Note: Numeric values will be
989+ # treated as continuous variables so
990+ # please ensure any categorical values
991+ # start with a letter to ensure they
992+ # are modelled correctly!
993+ # Please ensure your file is tab delimited,
994+ # check your columns names for the first and
995+ # second columns, i.e 1st=Sample, 2nd=Group.
996+ # Please see example below:
997+ Sample Group Sex
998+ Sample_1 G1 F
999+ Sample_2 G1 M
1000+ Sample_3 G1 F
1001+ Sample_4 G2 M
1002+ Sample_5 G2 F
1003+ Sample_6 G2 M
1004+ Sample_7 G3 F
1005+ Sample_8 G3 M
1006+ Sample_9 G3 F
1007+ Sample_10 G3 M
1008+ """
1009+ ))
1010+ return indices , has_header
1011+
1012+
1013+ def groups (file , delim = '\t ' ):
1014+ """Reads and parses a sample sheet, groups.tsv, into a dictionary.
1015+ This file acts as a sample sheet to gather sample metadata and define
1016+ relationship betweeen groups of samples. This tab delimited file
1017+ contains two columns. One column for the basename of the sample and
1018+ lastly, one column for the name of the sample's group. This group
1019+ information is used downstream in the pipeline for differential
1020+ expression rules. Comparisons between groups can be made with a
1021+ constrast.tsv file. This function returns a tuple containing a
1022+ dictionary containing group to sample list.
1023+ @param file <str>:
1024+ Path to groups TSV file.
1025+ @return groups <dict[str]>:
1026+ Dictionary containing group to samples, where each key is group
1027+ and its value is a list of samples belonging to that group
1028+ @Example: group.tsv
1029+ Sample Group
1030+ Sample_1 G1
1031+ Sample_2 G1
1032+ Sample_3 G1
1033+ Sample_4 G2
1034+ Sample_5 G2
1035+ Sample_6 G2
1036+ Sample_7 G3
1037+ Sample_8 G3
1038+ Sample_9 G3
1039+ Sample_0 G3
1040+ # Example data structure that is return given the file above
1041+ >> groups
1042+ {
1043+ 'G1': ['Sample_1', 'Sample_2', 'Sample_3'],
1044+ 'G2': ['Sample_4', 'Sample_5', 'Sample_6'],
1045+ 'G3': ['Sample_7', 'Sample_8', 'Sample_9', 'Sample_0']
1046+ }
1047+ """
1048+ # Get index of each required and
1049+ # optional column and determine if
1050+ # the file has a header
1051+ indices , header = index (file )
1052+ s_index = indices ['Sample' ]
1053+ g_index = indices ['Group' ]
1054+ # Parse sample and group
1055+ # information
1056+ groups = {}
1057+ invalid_group = False
1058+ with open (file , 'r' ) as fh :
1059+ if header :
1060+ # Skip over header
1061+ tmp = next (fh )
1062+ for line in fh :
1063+ linelist = [clean (l .strip ()) for l in line .split (delim )]
1064+ # Parse Sample information
1065+ try :
1066+ sample_name = linelist [s_index ]
1067+ if not sample_name : continue # skip over empty string
1068+ except IndexError :
1069+ continue # No sample information, skip over line
1070+ # Parse Group Information
1071+ try :
1072+ group = linelist [g_index ]
1073+ if not group : continue # skip over empty string
1074+ except IndexError :
1075+ continue # No group information, skip over line
1076+ # Check valid group names, names
1077+ # should start with a letter and
1078+ # can this contain the following
1079+ # characters:
1080+ # letters, numbers, underscores, periods
1081+ if not re .match ("^[A-Za-z][A-Za-z0-9_.]*$" , group ):
1082+ invalid_group = True
1083+ c = Colors ()
1084+ err ('{0}{1}Error: Invalid group name "{2}" for sample "{3}"!{4}' .format (
1085+ c .bg_red , c .white , group , sample_name , c .end
1086+ ))
1087+ err ('{0}{1} └── Names must start with a letter and can only contain letters, numbers, underscores (_), and periods (.){2}' .format (
1088+ c .bg_red , c .white , c .end
1089+ ))
1090+ # Add sample to its group
1091+ if group not in groups :
1092+ groups [group ] = []
1093+ if sample_name not in groups [group ]:
1094+ groups [group ].append (sample_name )
1095+ if invalid_group :
1096+ fatal ('{0}{1}Fatal: Please update the group names in "{2}" and try again!{3}' .format (
1097+ c .bg_red , c .white , file , c .end
1098+ ))
1099+ return groups
1100+
1101+
1102+ def extract_group_tokens (expr ):
1103+ """Extracts group names from an expression. Expressions can be relatively
1104+ complex, i.e "(G2-G2)", "(G3+G4)/2", "((G2-G2)-((G3+G4)/2))", or simple
1105+ like "G2". This function uses a regular expression to tokenize the expression
1106+ and extract group names while ignoring other operators and function names.
1107+ @param expr <str>:
1108+ Group expression string, i.e "(G2-G2)", "(G3+G4)/2", "G2"
1109+ @return out <list[str]>:
1110+ List of group names in expression
1111+ """
1112+ # Tokenize expression
1113+ _TOKEN_RE = re .compile (r"""
1114+ (?P<NUMBER> (?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)? ) |
1115+ (?P<ID> [A-Za-z][A-Za-z0-9._]* ) |
1116+ (?P<OP> \*\*|//|[+\-*/^:] ) |
1117+ (?P<LPAREN> \() |
1118+ (?P<RPAREN> \) ) |
1119+ (?P<WS> \s+ ) |
1120+ (?P<MISMATCH> . )
1121+ """ , re .VERBOSE )
1122+ # Find all ID tokens that are not
1123+ # operators or function names
1124+ toks = list (_TOKEN_RE .finditer (expr ))
1125+ out , seen = [], set ()
1126+ for i , t in enumerate (toks ):
1127+ if t .lastgroup != "ID" :
1128+ continue
1129+ # exclude function names: next non-WS token is '('
1130+ j = i + 1
1131+ while j < len (toks ) and toks [j ].lastgroup == "WS" :
1132+ j += 1
1133+ if j < len (toks ) and toks [j ].lastgroup == "LPAREN" :
1134+ continue
1135+ name = t .group ()
1136+ if name not in seen :
1137+ seen .add (name )
1138+ out .append (name )
1139+ return out
1140+
1141+
1142+ def contrasts (file , groups , delim = '\t ' ):
1143+ """Reads and parses the group comparison file, contrasts.tsv, into a
1144+ dictionary. This file acts as a config file to setup contrasts between
1145+ two groups, where groups of samples are defined in the groups.tsv file.
1146+ This information is used in differential analysis, like differential binding
1147+ analysis, differential gene expression analysis, etc.
1148+ @Example: contrasts.tsv
1149+ G2 G1
1150+ G4 G3
1151+ G5 G1
1152+ >> contrasts = contrasts('contrasts.tsv', groups = ['G1', 'G2', 'G3'])
1153+ >> contrasts
1154+ [
1155+ ["G2", "G1"],
1156+ ["G3", "G1"],
1157+ ["G3", "G2"]
1158+ ]
1159+ @param file <str>:
1160+ Path to contrasts TSV file.
1161+ @param groups list[<str>]:
1162+ List of groups defined in the groups file, enforces groups exist.
1163+ @return comparisons <list[list[str, str]]>:
1164+ Nested list contain comparsions of interest.
1165+ """
1166+ c = Colors ()
1167+ errors = []
1168+ comparsions = []
1169+ line_number = 0
1170+ with open (file ) as fh :
1171+ for line in fh :
1172+ line_number += 1
1173+ linelist = [clean (l .strip ()) for l in line .split (delim )]
1174+ try :
1175+ g1 = linelist [0 ] # Case group
1176+ g2 = linelist [1 ] # Control group
1177+ if not g1 or not g2 : continue # skip over empty lines
1178+ except IndexError :
1179+ # Missing a group, need two groups to tango
1180+ # This can happen if the file is NOT a TSV file,
1181+ # and it is seperated by white spaces, :(
1182+ err (
1183+ '{0}{1}Warning: {2} is missing at least one group on line {3}: {4}{5}' .format (
1184+ c .bg_yellow , c .black , file , line_number , line .strip (), c .end
1185+ ))
1186+ err ('{0}{1} └── Skipping over line, check if line is tab seperated... {2}' .format (
1187+ c .bg_yellow , c .black , c .end
1188+ ))
1189+ continue
1190+ # Check to see if groups where defined already,
1191+ # avoids user errors and spelling errors
1192+ for g in [* extract_group_tokens (g1 ), * extract_group_tokens (g2 )]:
1193+ if g not in groups :
1194+ # Collect all error and report them at end
1195+ errors .append (g )
1196+ # Add comparsion to list of comparisons
1197+ if [g1 , g2 ] not in comparsions :
1198+ comparsions .append ([g1 , g2 ])
1199+ if errors :
1200+ # One of the groups is not defined in groups
1201+ err ('{0}{1}Error: the following group(s) in "{2}" are not defined in groups file! {3}' .format (
1202+ c .bg_red , c .white , file , c .end
1203+ ))
1204+ fatal ('{0}{1} └── {2} {3}' .format (
1205+ c .bg_red , c .white , ',' .join (errors ), c .end
1206+ ))
1207+ return comparsions
1208+
1209+
8721210def run (sub_args ):
8731211 """Initialize, setup, and run the RNA-seek pipeline.
8741212 Calls initialize() to create output directory and copy over pipeline resources,
0 commit comments