Skip to content

Commit 5e65fa3

Browse files
committed
Adding parser for pipeline's sample sheet
1 parent e28a7ca commit 5e65fa3

File tree

1 file changed

+219
-0
lines changed

1 file changed

+219
-0
lines changed

src/files.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: UTF-8 -*-
3+
4+
# Python standard library
5+
from __future__ import print_function
6+
import csv, sys
7+
8+
# Local imports
9+
from utils import (
10+
Colors,
11+
err,
12+
fatal
13+
)
14+
15+
# Constants
16+
# Instance of the Colors helper imported from utils
_C = Colors()

# Columns that must be present, with a non-empty value,
# on every row of the sample sheet.
REQUIRED_SAMPLE_SHEET_COLUMNS = ["sample", "fastqs", "cytaimage", "slide", "area"]

# Columns that may be left out of the sample sheet or
# left blank. index_file() stores an empty string for a
# missing or blank optional value, and sample_sheet()
# by default fills an empty 'id' from the 'sample' field.
OPTIONAL_SAMPLE_SHEET_COLUMNS = [
    "id", "image", "darkimage",
    "colorizedimage", "loupe_alignment", "barcode_csv",
]
38+
39+
# Helper functions
40+
def stripped(s):
    """Cleans a string by removing whitespace and quotes from
    its leading and trailing ends. Whitespace is trimmed before
    the quotes are removed, so a padded and quoted value such as
    ' "A" ' is cleaned to 'A'. A value of None, which is what
    csv.DictReader stores for a field missing from a short row,
    is treated as an empty string.
    @param s <str|None>:
        String to remove quotes or clean
    @return s <str>:
        Cleaned string with quotes and outer whitespace removed
    """
    if s is None:
        return ''
    # Trim whitespace first to expose quotes hidden behind leading
    # or trailing spaces, then trim once more to drop any whitespace
    # that was sitting inside of the quotes.
    return s.strip().strip('"').strip("'").strip()
49+
50+
51+
def index_file(input_file, key, required_fields, optional_fields, delim=','):
    """Parses and indexes a file into a dictionary for quick
    lookups later. The file will be indexed as a nested dictionary
    where key is the first key and the second keys are the required
    and optional fields. Empty lines and lines starting with '#'
    are skipped.
    For example, if the file, sample_sheet.csv, contains the following:
    sample,fastqs,cytaimage,slide,area,image,id
    A,/path/to/fastq1,cytaimage1,slide1,area1,,
    B,/path/to/fastq2,cytaimage2,slide2,area2,image2,IDB
    >>> index_file("sample_sheet.csv", "sample",
                   ["fastqs","cytaimage","slide","area"],
                   ["image", "id"])
    {
        "A": {
            "fastqs": "/path/to/fastq1",
            "cytaimage": "cytaimage1",
            "slide": "slide1",
            "area": "area1",
            "image": "",
            "id": ""
        },
        "B": {
            "fastqs": "/path/to/fastq2",
            "cytaimage": "cytaimage2",
            "slide": "slide2",
            "area": "area2",
            "image": "image2",
            "id": "IDB"
        }
    }
    @param input_file <str>:
        File to parse and index. Must contain a header with
        the key column and the columns listed in required_fields.
        The index of these columns will be automatically resolved.
    @param key <str>:
        Column name of the first key to index the file by. Rows
        with an empty value in this column are reported as errors.
    @param required_fields <list[str]>:
        List of required column names that will be used as
        the second key to index the file. A column that is
        missing from the header, or empty in a row, is reported
        as an error.
    @param optional_fields <list[str]>:
        List of optional column names that will be used as
        the second key to index the file. If a column is not
        present in the file, or its value is empty, it will be
        set to an empty string.
    @param delim <str>:
        Delimiter used to separate columns in the file.
        Default is a comma (',').
    @return file_idx <dict[key][required_fields|optional_fields]=str>:
        Nested dictionary where,
            - key   = 'key' column value
            - value = {required_field_col: "A", optional_field_col: "B"}
        Given,
            key="A", required_fields=["C","D"]
        returns {"A": {"C": "c_i", "D": "d_i"}}
        Every problem found is reported with err(), and fatal()
        is called once parsing finishes if any were found.
    """
    errors = False      # Used to track errors
    file_idx = {}       # Nested dictionary with parsed file
    # Line number (in the file itself) of the last line handed to
    # the csv reader. It is kept in a list so the nested generator
    # can update it without needing the 'nonlocal' keyword.
    current_line = [0]

    def _data_lines(fh):
        """Yields non-empty, non-comment lines and records their line number."""
        for number, line in enumerate(fh, start=1):
            if line.strip() and not line.lstrip().startswith("#"):
                current_line[0] = number
                yield line

    with open(input_file, newline='') as fh:
        reader = csv.DictReader(_data_lines(fh), delimiter=delim)
        # fieldnames is None when the file has no header line at all
        header = reader.fieldnames or []
        if key not in header:
            # Nothing can be indexed without the key column, report
            # it once here instead of failing with a KeyError below.
            fatal(
                "Fatal: Missing required key column '{}' in the header of file '{}'!".format(
                    key, input_file
                )
            )
            return file_idx
        for parsed_line in reader:
            # Report the real line number in the file, the header,
            # comments, and blank lines are all accounted for.
            line_number = current_line[0]
            # Rows shorter than the header hold None for their
            # missing fields, 'or' converts those to empty strings.
            _k1 = stripped(parsed_line.get(key) or '')
            if not _k1:
                # A row without a key cannot be indexed
                err(
                    "Error: Missing required field '{}' in line {} of file '{}'!".format(
                        key, line_number, input_file
                    )
                )
                errors = True
                continue    # goto next row
            # Add first key to file_idx, rows that share
            # a key are merged into the same entry.
            entry = file_idx.setdefault(_k1, {})
            # Check for required fields
            for field in required_fields:
                value = stripped(parsed_line.get(field) or '')
                if not value:
                    # Required field is missing from the
                    # header or its value is empty
                    err(
                        "Error: Missing required field '{}' in line {} of file '{}'!".format(
                            field, line_number, input_file
                        )
                    )
                    errors = True
                    continue    # goto next field
                # Add required field to file_idx
                entry[field] = value
            # Add optional fields to file_idx, a field missing
            # from the header or left empty becomes ''.
            for field in optional_fields:
                entry[field] = stripped(parsed_line.get(field) or '')
    # Check for errors
    if errors:
        fatal(
            "Fatal: Errors were found while parsing file '{}'! Please fix the errors and try again.".format(input_file)
        )
    return file_idx
152+
153+
154+
def sample_sheet(
    file,
    required_fields=REQUIRED_SAMPLE_SHEET_COLUMNS,
    optional_fields=OPTIONAL_SAMPLE_SHEET_COLUMNS,
    remap_missing_fields={"id": "sample"}
):
    """Parses a sample sheet file and returns an indexed dictionary.
    The sample sheet must contain a header with the required fields.
    @param file <str>:
        Path to the sample sheet file to parse and index. This can be a
        .tsv, .txt, or .csv file (the extension is matched without
        regard to case). The file must contain a header with the
        required fields.
    @param required_fields <list[str]>:
        List of required field names that must be present in the header.
    @param optional_fields <list[str]>:
        List of optional field names that can be present in the header.
    @param remap_missing_fields <dict[str]=str>:
        Dictionary to remap missing fields to required fields.
        For example, if the sample sheet does not contain an 'id' field,
        it can be remapped to the 'sample' field which will always be
        present. This dictionary is only read, never modified. Passing
        None or an empty dictionary disables remapping.
    @return parsed_file <dict[sample][field]=str>:
        Nested dictionary returned by index_file(), indexed by the
        'sample' column, with any missing or empty remapped fields
        filled in.
    """
    # Compare against a lowercased copy so that files such
    # as 'samples.CSV' or 'samples.TSV' are accepted too.
    lowered = file.lower()
    if lowered.endswith(('.tsv', '.txt')):
        # Use tab as delimiter for TSV files
        delim = '\t'
    elif lowered.endswith('.csv'):
        # Use comma as delimiter for CSV files
        delim = ','
    else:
        # Unsupported file type, not sure what the
        # delimiter is here or what the user is trying
        # to do, so we will raise an error.
        # NOTE(review): assumes fatal() stops the program, if it
        # returns instead, delim is unbound below. Confirm this
        # against utils.fatal.
        fatal(
            "Fatal: Unsupported file type for sample sheet '{}'. "
            "Please provide a .tsv or .txt (tab-separated) or .csv (comma-separated) file.".format(file)
        )
    # Parse and index the sample sheet
    parsed_file = index_file(
        file,
        "sample",
        required_fields,
        optional_fields,
        delim=delim
    )
    # Remap missing fields to a required field
    remaps = remap_missing_fields or {}
    for metadata in parsed_file.values():
        for field, remap_field in remaps.items():
            if not metadata.get(field):
                # If the field is missing or empty, remap it
                # to a known required field. We are using this
                # to remap the 'id' field to the 'sample' field
                # if the 'id' field was not provided or it is
                # set to an empty string/value.
                metadata[field] = metadata.get(remap_field, '')
    return parsed_file
209+
210+
211+
if __name__ == "__main__":
    # Testing sample sheet parser
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an
        # IndexError traceback when no file is given.
        print(
            "Usage: {} <sample_sheet.tsv|.txt|.csv>".format(sys.argv[0]),
            file=sys.stderr
        )
        sys.exit(1)
    input_sample_sheet = sys.argv[1]  # supports .tsv, .txt, .csv
    parsed_file = sample_sheet(
        input_sample_sheet
    )
    # Print nested dictionary with
    # parsed sample sheet values
    print(parsed_file)

0 commit comments

Comments
 (0)