|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: UTF-8 -*- |
| 3 | + |
| 4 | +# Python standard library |
| 5 | +from __future__ import print_function |
| 6 | +import csv, sys |
| 7 | + |
| 8 | +# Local imports |
| 9 | +from utils import ( |
| 10 | + Colors, |
| 11 | + err, |
| 12 | + fatal |
| 13 | +) |
| 14 | + |
# Constants
# Instance of the Colors helper imported from utils.
# NOTE(review): not referenced in the visible part of this
# file -- confirm whether it is still needed.
_C = Colors()
# Required sample sheet field names. These are the default
# `required_fields` of sample_sheet(): each must be present
# in the header and non-empty on every row.
REQUIRED_SAMPLE_SHEET_COLUMNS = [
    "sample",
    "fastqs",
    "cytaimage",
    "slide",
    "area"
]
# Optional sample sheet field names. This is a plain list
# of column names and carries no default values: when an
# optional column is absent or empty, index_file() stores
# an empty string for it. With its default arguments,
# sample_sheet() then sets a missing or empty id field to
# the value of the required sample field.
OPTIONAL_SAMPLE_SHEET_COLUMNS = [
    "id",
    "image",
    "darkimage",
    "colorizedimage",
    "loupe_alignment",
    "barcode_csv"
]
| 38 | + |
| 39 | +# Helper functions |
def stripped(s):
    """Cleans a string by removing surrounding whitespace and any
    quotes from its leading and trailing ends.
    @param s <str|None>:
        String to remove quotes or clean. None is treated as
        an empty string, this is what csv.DictReader yields for
        columns that are missing from a short row.
    @return s <str>:
        Cleaned string with whitespace and quotes removed
    """
    if s is None:
        return ''
    # Whitespace must be removed first, otherwise a padded
    # value such as ' "foo" ' keeps its quotes because they
    # are not the outermost characters.
    return s.strip().strip('"').strip("'").strip()
| 49 | + |
| 50 | + |
def index_file(input_file, key, required_fields, optional_fields, delim=','):
    """Parses and indexes a file into a dictionary for quick
    lookups later. The file will be indexed as a nested dictionary
    where key is the first key and the second keys are the required
    and optional fields. Empty lines and lines starting with '#'
    are skipped.
    For example, if the file, sample_sheet.csv, contains the following:
        sample,fastqs,cytaimage,slide,area,image,id
        A,/path/to/fastq1,cytaimage1,slide1,area1,,
        B,/path/to/fastq2,cytaimage2,slide2,area2,image2,IDB
    >>> index_file("sample_sheet.csv", "sample",
                   ["fastqs","cytaimage","slide","area"],
                   ["image", "id"])
    {
        "A": {
            "fastqs": "/path/to/fastq1",
            "cytaimage": "cytaimage1",
            "slide": "slide1",
            "area": "area1",
            "image": "",
            "id": ""
        },
        "B": {
            "fastqs": "/path/to/fastq2",
            "cytaimage": "cytaimage2",
            "slide": "slide2",
            "area": "area2",
            "image": "image2",
            "id": "IDB"
        }
    }
    @param input_file <str>:
        File to parse and index. Must contain a header with
        the key column and the columns listed in required_fields.
        The index of these columns will be automatically resolved.
    @param key <str>:
        Column name of the first key to index the file by.
    @param required_fields <list[str]>:
        List of required column names that will be used as
        the second key to index the file. Each must be in the
        header and have a non-empty value on every row.
    @param optional_fields <list[str]>:
        List of optional column names that will be used as
        the second key to index the file. If a column is not
        present in the file, or its value is empty, it will be
        set to an empty string.
    @param delim <str>:
        Delimiter used to separate columns in the file.
        Default is a comma (',').
    @return file_idx <dict[key][required_fields|optional_fields]=str>:
        Nested dictionary where,
            • key   = 'key' column value
            • value = {required_field_col: "A", optional_field_col: "B"}
        Given,
            key="A", required_fields=["C","D"]
        returns {"A": {"C": "c_i", "D": "d_i"}}
        If the same key value occurs on several rows, values from
        later rows overwrite those of earlier rows.
    Every problem found is reported with err(); if any were found,
    fatal() is called once after the whole file has been checked.
    """
    errors = False        # Used to track errors
    file_idx = {}         # Nested dictionary with parsed file
    physical_lines = []   # File line number of each line given to csv

    def _content_lines(fh):
        # Skip empty lines and comments, remembering where each
        # kept line sits in the file so that error messages can
        # point at the real line number.
        for number, line in enumerate(fh, start=1):
            if line.strip() and not line.lstrip().startswith("#"):
                physical_lines.append(number)
                yield line

    with open(input_file, newline='') as fh:
        reader = csv.DictReader(_content_lines(fh), delimiter=delim)
        # fieldnames is None when the file has no header at all
        header = reader.fieldnames or []
        # Validate the header once, rather than reporting the
        # same missing column again for every row of the file
        needed_columns = [key] + [f for f in required_fields if f != key]
        for field in needed_columns:
            if field not in header:
                err(
                    "Error: Missing required column '{}' in header of file '{}'!".format(
                        field, input_file
                    )
                )
                errors = True
        # Rows cannot be indexed without a usable header
        rows = () if errors else reader
        for parsed_line in rows:
            # reader.line_num counts the lines consumed from
            # _content_lines, map it back to the file's line
            line_number = physical_lines[reader.line_num - 1]
            # csv.DictReader pads short rows with None, so
            # fall back to an empty string before cleaning
            _k1 = stripped(parsed_line.get(key) or '')
            values = {}
            empty_fields = [] if _k1 else [key]
            # Check for required fields
            for field in required_fields:
                value = stripped(parsed_line.get(field) or '')
                if value:
                    values[field] = value
                elif field not in empty_fields:
                    empty_fields.append(field)
            for field in empty_fields:
                err(
                    "Error: Missing required field '{}' in line {} of file '{}'!".format(
                        field, line_number, input_file
                    )
                )
                errors = True
            if not _k1:
                # Nothing to index this row by
                continue
            # Optional fields default to an empty string
            for field in optional_fields:
                values[field] = stripped(parsed_line.get(field) or '')
            file_idx.setdefault(_k1, {}).update(values)
    # Check for errors
    if errors:
        fatal(
            "Fatal: Errors were found while parsing file '{}'! Please fix the errors and try again.".format(input_file)
        )
    return file_idx
| 152 | + |
| 153 | + |
def sample_sheet(
    file,
    required_fields=REQUIRED_SAMPLE_SHEET_COLUMNS,
    optional_fields=OPTIONAL_SAMPLE_SHEET_COLUMNS,
    remap_missing_fields={"id": "sample"}
    ):
    """Parses a sample sheet file and returns an indexed dictionary.
    The sample sheet must contain a header with the required fields.
    @param file <str>:
        Path to the sample sheet file to parse and index. This can be a
        .tsv or .txt (tab-separated) or a .csv (comma-separated) file,
        the extension is matched case-insensitively. The file must
        contain a header with the required fields.
    @param required_fields <list[str]>:
        List of required field names that must be present in the header.
    @param optional_fields <list[str]>:
        List of optional field names that can be present in the header.
    @param remap_missing_fields <dict[str]=str>:
        Dictionary to remap missing fields to required fields.
        For example, if the sample sheet does not contain an 'id' field,
        it can be remapped to the 'sample' field which will always be
        present. This dictionary is only read, never modified.
    @return parsed_file <dict[sample][field]=str>:
        Nested dictionary returned by index_file(), keyed by the value
        of the 'sample' column, with missing fields remapped.
    """
    # Compare in lowercase so 'SHEET.CSV' or 'sheet.Tsv' also work
    filename = file.lower()
    if filename.endswith(('.tsv', '.txt')):
        # Use tab as delimiter for TSV files
        delim = '\t'
    elif filename.endswith('.csv'):
        # Use comma as delimiter for CSV files
        delim = ','
    else:
        # Unsupported file type, not sure what the
        # delimiter is here or what the user is trying
        # to do, so we will raise an error.
        # NOTE(review): delim stays unset on this path, so
        # fatal() is presumed not to return -- confirm in utils.
        fatal(
            "Error: Unsupported file type for sample sheet '{}'. "
            "Please provide a .tsv or .txt (tab-separated) or "
            "a .csv (comma-separated) file.".format(file)
        )
    # Parse and index the sample sheet
    parsed_file = index_file(
        file,
        "sample",
        required_fields,
        optional_fields,
        delim=delim
    )
    # Remap missing fields to a required field
    for metadata in parsed_file.values():
        for field, remap_field in remap_missing_fields.items():
            if not metadata.get(field):
                # If the field is missing or empty, remap it
                # to a known required field. We are using this
                # to remap the 'id' field to the 'sample' field
                # if the 'id' field was not provided or it is
                # set to an empty string/value.
                metadata[field] = metadata.get(remap_field, '')
    return parsed_file
| 209 | + |
| 210 | + |
if __name__ == "__main__":
    # Testing sample sheet parser
    if len(sys.argv) != 2:
        # Exit with a usage message instead of an IndexError
        # traceback when the sample sheet path is not given
        sys.exit(
            "Usage: {} <sample_sheet.tsv|sample_sheet.txt|sample_sheet.csv>".format(sys.argv[0])
        )
    input_sample_sheet = sys.argv[1]  # supports .tsv, .txt, .csv
    parsed_file = sample_sheet(
        input_sample_sheet
    )
    # Print nested dictionary with
    # parsed sample sheet values
    print(parsed_file)
0 commit comments