11import os , sys
22
33import matplotlib .pyplot as plt
4+ import numpy as np
45
56
67from transposonmapper .statistics .volcano_helpers import apply_stats , info_from_datasets , make_datafile
78
8- def volcano (path_a , filelist_a , path_b , filelist_b , variable = 'read_per_gene' , significance_threshold = 0.01 , normalize = True , trackgene_list = [], figure_title = "" ):
9+ def volcano (path_a , filelist_a , path_b , filelist_b ,fold_change_interval ,p_value_interval ,variable = 'read_per_gene' ,
10+ significance_threshold = 0.01 , normalize = True , trackgene_list = [], figure_title = "" ,savefigure = False ):
911 """This script creates a volcanoplot to show the significance of fold change between two datasets.
1012 It is based on this website:
1113 - https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
@@ -50,6 +52,16 @@ def volcano(path_a, filelist_a, path_b, filelist_b, variable='read_per_gene', si
5052 The format of the pergene file should be TAB separated and NOT COMMA separated. if you have
5153 it as comma separated you can convert to tab separated using the command line with
5254 this command: `cat oldfile.txt | tr '[,]' '[\t ]' > newfile.txt`
55+
56+ fold_change_interval : numpy.array
57+
58+ a vector of two elements where the first element(positive value) set the upper threshold for the positive
59+ values and the second element(negative value) set the lower threshold for the negative values.
60+
61+
62+ p_value_interval : numpy.array
63+ a vector of two elements where the first element set the upper threshold for the positive
64+ values and the second element set the lower threshold for the negative values.
5365 variable : str, optional
5466 tn_per_gene, read_per_gene or Nreadsperinsrt , by default 'read_per_gene'
5567 significance_threshold : float, optional
@@ -61,6 +73,7 @@ def volcano(path_a, filelist_a, path_b, filelist_b, variable='read_per_gene', si
6173 Enter a list of gene name(s) which will be highlighted in the plot (e.g. ['cdc42', 'nrp1']), by default []
6274 figure_title : str, optional
6375 The title of the figure if not empty, by default ""
76+ savefigure : bool, optional
6477
6578
6679 Returns
@@ -101,26 +114,29 @@ def volcano(path_a, filelist_a, path_b, filelist_b, variable='read_per_gene', si
101114 grid = plt .GridSpec (1 , 1 , wspace = 0.0 , hspace = 0.0 )
102115 ax = plt .subplot (grid [0 ,0 ])
103116
104- colors = {False :'black' , True :'red' } # based on p-value significance
105- sc = ax .scatter (x = volcano_df ['fold_change' ], y = volcano_df ['p_value' ], alpha = 0.4 , marker = '.' , c = volcano_df ['significance' ].apply (lambda x :colors [x ]))
117+ colors = {False :'gray' , True :'red' } # based on p-value significance
118+ sc = ax .scatter (x = volcano_df ['fold_change' ], y = volcano_df ['p_value' ], alpha = 0.4 ,
119+ s = 100 , c = volcano_df ['significance' ].apply (lambda x :colors [x ]),label = variable )
106120 ax .grid (True , which = 'major' , axis = 'both' , alpha = 0.4 )
107- ax .set_xlabel ('Log2 FC' )
108- ax .set_ylabel ('-1*Log10 p-value' )
121+ ax .set_xlabel ('Log2 FC' ,fontsize = 16 )
122+ ax .set_ylabel ('-*Log10 p-value' ,fontsize = 16 )
123+ ax .tick_params (axis = 'both' , which = 'major' , labelsize = 14 )
124+ ax .legend (loc = 'best' ,fontsize = 16 )
109125 if not figure_title == "" :
110- ax .set_title (variable + " - " + figure_title )
126+ ax .set_title (figure_title , fontsize = 20 )
111127 else :
112- ax .set_title (variable )
113- ax .scatter (x = [],y = [],marker = '.' ,color = 'black' , label = 'p-value > {}' .format (significance_threshold )) #set empty scatterplot for legend
114- ax .scatter (x = [],y = [],marker = '.' ,color = 'red' , label = 'p-value < {}' .format (significance_threshold )) #set empty scatterplot for legend
115- ax .legend ()
128+ ax .set_title (variable , fontsize = 20 )
129+ # ax.scatter(x=[],y=[],marker='.',color='black', label='p-value > {}'.format(significance_threshold)) #set empty scatterplot for legend
130+ # ax.scatter(x=[],y=[],marker='.',color='red', label='p-value < {}'.format(significance_threshold)) #set empty scatterplot for legend
131+ # ax.legend()
116132 if not trackgene_list == []:
117133 genenames_array = volcano_df ['gene_names' ].to_numpy ()
118134 for trackgene in trackgene_list :
119135 trackgene = trackgene .upper ()
120136 if trackgene in genenames_array :
121137 trackgene_index = tnread_gene_a .loc [tnread_gene_a ['gene_names' ] == trackgene ].index [0 ]
122138 trackgene_annot = ax .annotate (volcano_df .iloc [trackgene_index ,:]['gene_names' ], (volcano_df .iloc [trackgene_index ,:]['fold_change' ], volcano_df .iloc [trackgene_index ,:]['p_value' ]),
123- size = 10 , c = 'green' , bbox = dict (boxstyle = "round" , fc = "w" ))
139+ size = 12 , c = 'green' , bbox = dict (boxstyle = "round" , fc = "w" ))
124140 trackgene_annot .get_bbox_patch ().set_alpha (0.6 )
125141 else :
126142 print ('WARNING: %s not found' % trackgene )
@@ -133,6 +149,36 @@ def volcano(path_a, filelist_a, path_b, filelist_b, variable='read_per_gene', si
133149 arrowprops = dict (arrowstyle = "->" ))
134150 annot .set_visible (False )
135151
152+ ## Annotate based on fold change and p values
153+ if not fold_change_interval == [] or not p_value_interval == []:
154+ target = volcano_df [(volcano_df ["fold_change" ]> fold_change_interval [0 ]) & (volcano_df ["p_value" ]> p_value_interval [0 ])]["gene_names" ]
155+ target_left = volcano_df [(volcano_df ["fold_change" ]< fold_change_interval [1 ]) & (volcano_df ["p_value" ]> p_value_interval [1 ])]["gene_names" ]
156+
157+ if len (target )!= 0 :
158+ for i in np .arange (0 ,len (target )):
159+
160+ index = np .where (volcano_df == target .iloc [i ])[0 ][0 ]
161+
162+
163+
164+ trackgene_annot = ax .annotate (volcano_df .iloc [index ,:]['gene_names' ], (volcano_df .iloc [index ,:]['fold_change' ],
165+ volcano_df .iloc [index ,:]['p_value' ]),
166+ size = 12 , c = 'green' , bbox = dict (boxstyle = "round" , fc = "w" ))
167+ trackgene_annot .get_bbox_patch ().set_alpha (0.6 )
168+
169+ if len (target_left )!= 0 :
170+ for i in np .arange (0 ,len (target_left )):
171+
172+ index = np .where (volcano_df == target_left .iloc [i ])[0 ][0 ]
173+
174+
175+
176+ trackgene_annot = ax .annotate (volcano_df .iloc [index ,:]['gene_names' ], (volcano_df .iloc [index ,:]['fold_change' ],
177+ volcano_df .iloc [index ,:]['p_value' ]),
178+ size = 12 , c = 'green' , bbox = dict (boxstyle = "round" , fc = "w" ))
179+ trackgene_annot .get_bbox_patch ().set_alpha (0.6 )
180+
181+
136182
137183
138184 def update_annot (ind ):
@@ -162,6 +208,9 @@ def hover(event):
162208
163209 fig .canvas .mpl_connect ("motion_notify_event" , hover )
164210
211+ if savefigure :
212+ fig .savefig ("volcano_" + filelist_a [0 ].split ("_" )[0 ]+ "vs" + filelist_b [0 ].split ("_" )[0 ]+ ".png" , dpi = 400 )
213+
165214
166215## return function
167216 return (volcano_df )
0 commit comments