Skip to content
Open
132 changes: 132 additions & 0 deletions poretools/dataconc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import sys
import Fast5File
import numpy as np
import pandas
import matplotlib.pyplot as plt

#logging
import logging
logger = logging.getLogger('poretools')
logger.setLevel(logging.INFO)

def plot_data_conc(sizes, args):
    """
    Use matplotlib to plot a bar chart of total data (bp) vs. read sizes.

    sizes -- iterable of read lengths in base pairs.
    args  -- argparse namespace; uses bin_width, percent, cumulative, saveas.

    The plot is saved to args.saveas (.pdf or .jpg) when given, otherwise
    shown in an interactive window.
    """
    binwidth = args.bin_width
    reads = pandas.DataFrame(data=dict(lengths=sizes))
    high = max(reads['lengths'])
    if args.percent:
        # Express each bar as a percentage of the total bases sequenced.
        scale = 100 / float(sum(reads['lengths']))
    else:
        scale = 1

    # Plot - taking a polygon approach:
    # for each bin, get the sum of read lengths that fall within that bin
    # region and plot a bar/rectangle of that height over the bin region.
    height = 0
    for i in range(0, high, binwidth):
        # Build one combined mask for the half-open bin (i, i + binwidth].
        # The original chained indexing (s[m1][m2]) applied a full-length
        # boolean mask to an already-filtered, shorter series; that relied
        # on pandas index alignment and raises on modern pandas.
        in_bin = (reads['lengths'] > i) & (reads['lengths'] <= i + binwidth)
        if args.cumulative:
            height += scale * sum(reads['lengths'][in_bin])
        else:
            height = scale * sum(reads['lengths'][in_bin])
        x = [i, i, i + binwidth, i + binwidth, i]  # x-dims of bar/rectangle (left, left, right, right, left)
        y = [0, height, height, 0, 0]              # y-dims of bar/rectangle (bottom, top, top, bottom, bottom)
        plt.fill(x, y, 'r')  # plot bar/rectangle (color is red for now)

    if args.cumulative:
        plt.title("Cumulative Data Concentration Plot")
    else:
        plt.title("Data Concentration Plot")
    plt.xlabel("Read length (bp)")
    if args.percent:
        plt.ylabel("Percent of Data")
    else:
        plt.ylabel("Data (bp)")

    # Saving or plotting to screen (user can save it from there as well).
    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf") or plot_file.endswith(".jpg"):
            plt.savefig(plot_file)
        else:
            logger.error("Unrecognized extension for %s! Try .pdf or .jpg" % (plot_file))
            # Exit non-zero so callers/shell pipelines see the failure
            # (bare sys.exit() exits with status 0).
            sys.exit(1)
    else:
        plt.show()


def run(parser, args):
    """
    Entry point for the data_conc subcommand.

    Collects read lengths from the FAST5 files in args.files (or simulates
    them when args.simulate is set) and hands the list to plot_data_conc.
    """
    if args.simulate:
        if args.parameters:  # fastest method is to just supply all parameters
            parameters = args.parameters.split(",")
            readcount = int(parameters[0])
            minsize = int(parameters[1])
            maxsize = int(parameters[2])
        else:  # derive N, minsize, maxsize by scanning the input files
            readcount = 0
            files_processed = 0
            minsize = float('inf')
            maxsize = 0
            for fast5 in Fast5File.Fast5FileSet(args.files):
                if args.high_quality:
                    # Keep only reads with more complement than template events.
                    if fast5.get_complement_events_count() <= \
                       fast5.get_template_events_count():
                        fast5.close()
                        continue
                if args.single_read:
                    fqs = [fast5.get_fastq()]
                else:
                    fqs = fast5.get_fastqs(args.type)
                for fq in fqs:
                    if fq is not None:
                        readcount += 1
                        size = len(fq.seq)
                        # Track both bounds independently: with the original
                        # if/elif a single read could never update both, so
                        # minsize stayed at infinity whenever every read also
                        # raised maxsize (e.g. one read, or sorted input),
                        # crashing the random draw below.
                        if size > maxsize:
                            maxsize = size
                        if size < minsize:
                            minsize = size
                files_processed += 1
                if files_processed % 100 == 0:
                    logger.info("%d files processed." % files_processed)
                fast5.close()
        # NOTE(review): with the parser defaults (min_length=0,
        # max_length=1000000000) the second test is always truthy and
        # overwrites the scanned/supplied maxsize — confirm intended.
        if args.min_length:
            minsize = int(args.min_length)
        if args.max_length:
            maxsize = int(args.max_length)
        # randint's upper bound is exclusive, so +1 keeps maxsize inclusive;
        # np.random.random_integers was deprecated and removed from NumPy.
        sizes = list(np.random.randint(minsize, maxsize + 1, readcount))
        logger.info("parameters: N=%d, minsize=%d, maxsize=%d" % (readcount, minsize, maxsize))

    else:
        sizes = []
        files_processed = 0
        for fast5 in Fast5File.Fast5FileSet(args.files):
            if args.start_time or args.end_time:
                read_start_time = fast5.get_start_time()
                read_end_time = fast5.get_end_time()
                # Skip reads outside the requested [start, end] window.
                if args.start_time and args.start_time > read_start_time:
                    fast5.close()
                    continue
                if args.end_time and args.end_time < read_end_time:
                    fast5.close()
                    continue
            if args.high_quality:
                # Keep only reads with more complement than template events.
                if fast5.get_complement_events_count() <= \
                   fast5.get_template_events_count():
                    fast5.close()
                    continue
            if args.single_read:
                fqs = [fast5.get_fastq()]
            else:
                fqs = fast5.get_fastqs(args.type)
            for fq in fqs:
                # Keep lengths within [min_length, max_length].
                if fq is not None and not (len(fq.seq) < args.min_length or len(fq.seq) > args.max_length):
                    sizes.append(len(fq.seq))
            files_processed += 1
            if files_processed % 100 == 0:
                logger.info("%d files processed." % files_processed)
            fast5.close()

    plot_data_conc(sizes, args)


86 changes: 86 additions & 0 deletions poretools/poretools_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def run_subtool(parser, args):
import fastq as submodule
elif args.command == 'hist':
import hist as submodule
elif args.command == 'data_conc':
import dataconc as submodule
elif args.command == 'nucdist':
import nucdist as submodule
elif args.command == 'occupancy':
Expand Down Expand Up @@ -218,6 +220,89 @@ def main():
parser_hist.set_defaults(func=run_subtool)


##########
# data_conc (data concentration plot)
##########
parser_dataconc = subparsers.add_parser('data_conc',
                                        help='''Plot sum of read lengths in each bin for a given set of bins for a set of FAST5 files.
                                        This is the type of plot seen in MinKNOW while sequencing.''')
parser_dataconc.add_argument('files', metavar='FILES', nargs='+',
                             help='The input FAST5 files.')
parser_dataconc.add_argument('--min-length',
                             dest='min_length',
                             default=0,
                             type=int,
                             help=('Minimum read length to be included in analysis.'))
parser_dataconc.add_argument('--max-length',
                             dest='max_length',
                             default=1000000000,
                             type=int,
                             help=('Maximum read length to be included in analysis.'))
parser_dataconc.add_argument('--bin-width',
                             dest='bin_width',
                             default=500,
                             type=int,
                             help=('The width of bins (default: 500 bp).'))
parser_dataconc.add_argument('--saveas',
                             dest='saveas',
                             metavar='STRING',
                             help='''Save the plot to a file named filename.extension (e.g. pdf, jpg)''',
                             default=None)
parser_dataconc.add_argument('--cumulative',
                             action="store_true",
                             help='''For cumulative plot.''',
                             default=False)
parser_dataconc.add_argument('--percent',
                             action="store_true",
                             help='''Plot as percentage of all data.''',
                             default=False)
parser_dataconc.add_argument('--simulate',
                             action="store_true",
                             help='''This will randomly sample N read lengths in the size range from min to max (or according to parameters set by --parameters),
                             where N is the number of reads in the fast5 dir (or N specified with --parameters).
                             Then it will plot the simulation lengths. INFO about parameters used is printed so that
                             it can be reproduced with --parameters in the future (much faster).''',
                             default=False)
parser_dataconc.add_argument('--parameters',
                             type=str,
                             help='''--simulate by default will use N=readcount, range=min-to-max. Override this with --parameters N,min,max. e.g. --parameters 350,500,48502''',
                             default=False)

parser_dataconc.add_argument('--start',
                             dest='start_time',
                             default=None,
                             type=int,
                             help='Only analyze reads from after start timestamp')
parser_dataconc.add_argument('--end',
                             dest='end_time',
                             default=None,
                             type=int,
                             help='Only analyze reads from before end timestamp')
parser_dataconc.add_argument('--high-quality',
                             dest='high_quality',
                             default=False,
                             action='store_true',
                             help='Only analyze reads with more complement events than template. Can be used with --type or --one-read-per-molecule to select a specific read type from high quality reads.')
# --type and --one-read-per-molecule are mutually exclusive read filters.
parser_dataconc_readfilter = parser_dataconc.add_mutually_exclusive_group()
parser_dataconc_readfilter.add_argument('--type',
                                        dest='type',
                                        metavar='STRING',
                                        choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
                                        default='all',
                                        help='Which type of reads should be analyzed? Def.=all, choices=[all, fwd, rev, 2D, fwd,rev]. Is mutually exclusive with --one-read-per-molecule.')
parser_dataconc_readfilter.add_argument('-1', '--one-read-per-molecule',
                                        dest='single_read',
                                        default=False,
                                        action='store_true',
                                        help='''Only analyze one read per molecule in priority order: 2D -> template -> complement.
                                        That is, if there is a 2D read use that. If not, then try to use template. etc.
                                        Is mutually exclusive with --type.''')
parser_dataconc.set_defaults(func=run_subtool)



###########
# events
###########
Expand Down Expand Up @@ -274,6 +359,7 @@ def main():
parser_qualdist.set_defaults(func=run_subtool)



##########
# winner
##########
Expand Down