Skip to content
Open
132 changes: 132 additions & 0 deletions poretools/dataconc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import sys
import Fast5File
import numpy as np
import pandas
import matplotlib.pyplot as plt

#logging
import logging
logger = logging.getLogger('poretools')
logger.setLevel(logging.INFO)

def plot_data_conc(sizes, args):
    """
    Use matplotlib to plot a bar chart of total data (bp) vs. read sizes.

    sizes -- iterable of read lengths in base pairs.
    args  -- argparse namespace; uses bin_width, percent, cumulative, saveas.

    The plot is saved to args.saveas (.pdf or .jpg) when given, otherwise
    shown in an interactive window.
    """
    binwidth = args.bin_width
    reads = pandas.DataFrame(data=dict(lengths=sizes))
    high = max(reads['lengths'])
    if args.percent:
        # Express each bar as a percentage of the total bases sequenced.
        scale = 100 / float(sum(reads['lengths']))
    else:
        scale = 1

    # Plot - taking a polygon approach:
    # for each bin, get the sum of read lengths that fall within that bin
    # region and plot a bar/rectangle of that height over the bin region.
    height = 0
    for i in range(0, high, binwidth):
        # Build one combined mask for the half-open bin (i, i + binwidth].
        # The original chained indexing (s[m1][m2]) applied a full-length
        # boolean mask to an already-filtered, shorter series; that relied
        # on pandas index alignment and raises on modern pandas.
        in_bin = (reads['lengths'] > i) & (reads['lengths'] <= i + binwidth)
        if args.cumulative:
            height += scale * sum(reads['lengths'][in_bin])
        else:
            height = scale * sum(reads['lengths'][in_bin])
        x = [i, i, i + binwidth, i + binwidth, i]  # x-dims of bar/rectangle (left, left, right, right, left)
        y = [0, height, height, 0, 0]              # y-dims of bar/rectangle (bottom, top, top, bottom, bottom)
        plt.fill(x, y, 'r')  # plot bar/rectangle (color is red for now)

    if args.cumulative:
        plt.title("Cumulative Data Concentration Plot")
    else:
        plt.title("Data Concentration Plot")
    plt.xlabel("Read length (bp)")
    if args.percent:
        plt.ylabel("Percent of Data")
    else:
        plt.ylabel("Data (bp)")

    # Saving or plotting to screen (user can save it from there as well).
    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf") or plot_file.endswith(".jpg"):
            plt.savefig(plot_file)
        else:
            logger.error("Unrecognized extension for %s! Try .pdf or .jpg" % (plot_file))
            # Exit non-zero so callers/shell pipelines see the failure
            # (bare sys.exit() exits with status 0).
            sys.exit(1)
    else:
        plt.show()


def run(parser, args):
    """
    Entry point for the data_conc subcommand.

    Collects read lengths from the FAST5 files in args.files (or simulates
    them when args.simulate is set) and hands the list to plot_data_conc.
    """
    if args.simulate:
        if args.parameters:  # fastest method is to just supply all parameters
            parameters = args.parameters.split(",")
            readcount = int(parameters[0])
            minsize = int(parameters[1])
            maxsize = int(parameters[2])
        else:  # derive N, minsize, maxsize by scanning the input files
            readcount = 0
            files_processed = 0
            minsize = float('inf')
            maxsize = 0
            for fast5 in Fast5File.Fast5FileSet(args.files):
                if args.high_quality:
                    # Keep only reads with more complement than template events.
                    if fast5.get_complement_events_count() <= \
                       fast5.get_template_events_count():
                        fast5.close()
                        continue
                if args.single_read:
                    fqs = [fast5.get_fastq()]
                else:
                    fqs = fast5.get_fastqs(args.type)
                for fq in fqs:
                    if fq is not None:
                        readcount += 1
                        size = len(fq.seq)
                        # Track both bounds independently: with the original
                        # if/elif a single read could never update both, so
                        # minsize stayed at infinity whenever every read also
                        # raised maxsize (e.g. one read, or sorted input),
                        # crashing the random draw below.
                        if size > maxsize:
                            maxsize = size
                        if size < minsize:
                            minsize = size
                files_processed += 1
                if files_processed % 100 == 0:
                    logger.info("%d files processed." % files_processed)
                fast5.close()
        # NOTE(review): with the parser defaults (min_length=0,
        # max_length=1000000000) the second test is always truthy and
        # overwrites the scanned/supplied maxsize — confirm intended.
        if args.min_length:
            minsize = int(args.min_length)
        if args.max_length:
            maxsize = int(args.max_length)
        # randint's upper bound is exclusive, so +1 keeps maxsize inclusive;
        # np.random.random_integers was deprecated and removed from NumPy.
        sizes = list(np.random.randint(minsize, maxsize + 1, readcount))
        logger.info("parameters: N=%d, minsize=%d, maxsize=%d" % (readcount, minsize, maxsize))

    else:
        sizes = []
        files_processed = 0
        for fast5 in Fast5File.Fast5FileSet(args.files):
            if args.start_time or args.end_time:
                read_start_time = fast5.get_start_time()
                read_end_time = fast5.get_end_time()
                # Skip reads outside the requested [start, end] window.
                if args.start_time and args.start_time > read_start_time:
                    fast5.close()
                    continue
                if args.end_time and args.end_time < read_end_time:
                    fast5.close()
                    continue
            if args.high_quality:
                # Keep only reads with more complement than template events.
                if fast5.get_complement_events_count() <= \
                   fast5.get_template_events_count():
                    fast5.close()
                    continue
            if args.single_read:
                fqs = [fast5.get_fastq()]
            else:
                fqs = fast5.get_fastqs(args.type)
            for fq in fqs:
                # Keep lengths within [min_length, max_length].
                if fq is not None and not (len(fq.seq) < args.min_length or len(fq.seq) > args.max_length):
                    sizes.append(len(fq.seq))
            files_processed += 1
            if files_processed % 100 == 0:
                logger.info("%d files processed." % files_processed)
            fast5.close()

    plot_data_conc(sizes, args)


86 changes: 86 additions & 0 deletions poretools/poretools_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def run_subtool(parser, args):
import fastq as submodule
elif args.command == 'hist':
import hist as submodule
elif args.command == 'data_conc':
import dataconc as submodule
elif args.command == 'nucdist':
import nucdist as submodule
elif args.command == 'occupancy':
Expand Down Expand Up @@ -218,6 +220,89 @@ def main():
parser_hist.set_defaults(func=run_subtool)


##########
# data_conc (data concentration plot)
##########
parser_dataconc = subparsers.add_parser('data_conc',
                                        help='''Plot sum of read lengths in each bin for a given set of bins for a set of FAST5 files.
                                        This is the type of plot seen in MinKNOW while sequencing.''')
parser_dataconc.add_argument('files', metavar='FILES', nargs='+',
                             help='The input FAST5 files.')
parser_dataconc.add_argument('--min-length',
                             dest='min_length',
                             default=0,
                             type=int,
                             help=('Minimum read length to be included in analysis.'))
parser_dataconc.add_argument('--max-length',
                             dest='max_length',
                             default=1000000000,
                             type=int,
                             help=('Maximum read length to be included in analysis.'))
parser_dataconc.add_argument('--bin-width',
                             dest='bin_width',
                             default=500,
                             type=int,
                             help=('The width of bins (default: 500 bp).'))
parser_dataconc.add_argument('--saveas',
                             dest='saveas',
                             metavar='STRING',
                             help='''Save the plot to a file named filename.extension (e.g. pdf, jpg)''',
                             default=None)
parser_dataconc.add_argument('--cumulative',
                             action="store_true",
                             help='''For cumulative plot.''',
                             default=False)
parser_dataconc.add_argument('--percent',
                             action="store_true",
                             help='''Plot as percentage of all data.''',
                             default=False)
parser_dataconc.add_argument('--simulate',
                             action="store_true",
                             help='''This will randomly sample N read lengths in the size range from min to max (or according to parameters set by --parameters),
                             where N is the number of reads in the fast5 dir (or N specified with --parameters).
                             Then it will plot the simulation lengths. INFO about parameters used is printed so that
                             it can be reproduced with --parameters in the future (much faster).''',
                             default=False)
parser_dataconc.add_argument('--parameters',
                             type=str,
                             help='''--simulate by default will use N=readcount, range=min-to-max. Override this with --parameters N,min,max. e.g. --parameters 350,500,48502''',
                             default=False)

parser_dataconc.add_argument('--start',
                             dest='start_time',
                             default=None,
                             type=int,
                             help='Only analyze reads from after start timestamp')
parser_dataconc.add_argument('--end',
                             dest='end_time',
                             default=None,
                             type=int,
                             help='Only analyze reads from before end timestamp')
parser_dataconc.add_argument('--high-quality',
                             dest='high_quality',
                             default=False,
                             action='store_true',
                             help='Only analyze reads with more complement events than template. Can be used with --type or --one-read-per-molecule to select a specific read type from high quality reads.')
# --type and --one-read-per-molecule are mutually exclusive read filters.
parser_dataconc_readfilter = parser_dataconc.add_mutually_exclusive_group()
parser_dataconc_readfilter.add_argument('--type',
                                        dest='type',
                                        metavar='STRING',
                                        choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
                                        default='all',
                                        help='Which type of reads should be analyzed? Def.=all, choices=[all, fwd, rev, 2D, fwd,rev]. Is mutually exclusive with --one-read-per-molecule.')
parser_dataconc_readfilter.add_argument('-1', '--one-read-per-molecule',
                                        dest='single_read',
                                        default=False,
                                        action='store_true',
                                        help='''Only analyze one read per molecule in priority order: 2D -> template -> complement.
                                        That is, if there is a 2D read use that. If not, then try to use template. etc.
                                        Is mutually exclusive with --type.''')
parser_dataconc.set_defaults(func=run_subtool)



###########
# events
###########
Expand Down Expand Up @@ -274,6 +359,7 @@ def main():
parser_qualdist.set_defaults(func=run_subtool)



##########
# winner
##########
Expand Down