211211# ###################### 0b. SETTING UP #######################
212212# #############################################################
213213
214+ echo " Welcome to Stargraph; lets get things set up"
215+
214216# #redefining some variables and checking
215217
216218# #paths to raw data
@@ -247,6 +249,9 @@ flank2=$( echo ${flank} | awk '{print $1/1000}' )
247249# ################ STEP 1: CREATING THE GENOME GRAPH #################
248250# ####################################################################
249251
252+ echo " Paths look good; lets begin"
253+ echo " Step 1: Building the genome-graph"
254+
250255
251256# #calculate the minimum nucleotide identity between all assemblies provided
252257# #this will made slightly more lentient (adding 5% more divergence) and provided as the -p option in pggb (if not provided manually)
@@ -287,6 +292,7 @@ cd 1.${prefix}.pggb
287292# ################ STEP 2: IDENTIFYING PRESENCE/ABSENCE VARIANT #################
288293# ###############################################################################
289294
295+ echo " Step 2: Identifying Presence-Absence variants"
290296
291297# #the simpliest way to do so was to use the odgi presence-absence 'PAV' function
292298# #it can produce a matrix of whether regions are present or absent in for each strain
@@ -353,6 +359,7 @@ cd ..
353359# ################ STEP 3a: IDENTIFYING STARSHIP-LIKE REGIONS ###################
354360# ###############################################################################
355361
362+ echo " Step 3a: Elevating PAVs to Starship-like regions"
356363
357364mkdir 2.PAVs_to_SLRs
358365cd 2.PAVs_to_SLRs
@@ -431,6 +438,9 @@ cd ..
431438# ################ STEP 3b: GENERATING SLR PLOTS ###################
432439# ##################################################################
433440
441+ echo " Step 3b: Clustering SLRs; aligning clusters and generating plots"
442+
443+
434444mkdir 3.SLR_plots
435445cd 3.SLR_plots
436446
597607echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .SLRs.bed
598608cat ../2.PAVs_to_SLRs/${prefix} .SLRs.plus_clusters.tsv | awk -v cluster=" ${cluster} " ' {if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster} .SLRs.bed
599609
600-
601-
602610# #add the contigs missing the elements to the complete contigs file
603611cat ${cluster} .absent.contigs.fa >> ${cluster} .contigs.fa
604612
@@ -615,18 +623,27 @@ Rscriptpath=$( which gggenomes_skeleton.stargraph.R )
615623cat ${Rscriptpath} | sed " s/CLUSTER/${cluster} /g" | sed " s|PATHTOOUTPUT|${outputpath} /3.SLR_plots|g" > ${cluster} .R
616624Rscript ${cluster} .R
617625
626+ # #remove some of the fasta files
627+ rm ${cluster} .contigs.fa
628+ rm ${cluster} .absent.*
629+ rm ${cluster} .contigs.nucmer.delta
630+
618631done
619632
620633# #cleanup the default printing of Rplots
621634rm Rplots.pdf
622635
636+
623637cd ..
624638
625639# #########################################################################
626640# ################ STEP 4: COMBINING SLRs AND STARSHIPS ###################
627641# #########################################################################
628642
629643
644+ echo " Step 4: Combining SLRs and Starships into a non-redundant dataset"
645+
646+
630647# #we now have out SLR dataset and can combine this with the Starship input from starfish
631648# #in doing so we actually do three simple steps
632649# #Step 1: subtract all the SLR regions that overlap with a starfish region
@@ -696,6 +713,9 @@ cd ..
696713# ######################################################################################
697714
698715
716+ echo " Step 5: Generating network plots and further clustering/alignments using the non-redundant dataset"
717+
718+
699719# #we now have our Starship-SLR dataset we can see what this combined dataset looks like
700720# #we can do this in terms of kmer similiarity initially
701721# #this allows us to do two thing; build networks of the similarities AND cluster Starships and SLRs and align them to one another
@@ -712,10 +732,11 @@ awk -F " " '{print $1}' ../4.SLR_starship_combination/${prefix}.starships_SLRs.f
712732sourmash sketch dna -p k=31,noabund --singleton -o sourmash_signatures/ temp.fa
713733# #first we can get the jaccard similarity
714734sourmash compare -k 31 sourmash_signatures/* .sig.gz --csv sourmash_signatures.compare_k31.jaccard.csv
715-
716735# #and second we can get the max pairwise containment score
717736sourmash compare -k 31 sourmash_signatures/* .sig.gz --max-containment --csv sourmash_signatures.compare_k31.containment.csv
718737
738+ rm temp.fa
739+
719740# #need a very low threshold to remove all very small similarities, here using 10% jaccard similarity
720741threshold=" 10"
721742# #generate header for the file that will be used to build the network
@@ -751,6 +772,22 @@ Rscript network_plotting.R
751772
752773
753774
775+ # #cluster all the Starships and SLRs using the same kmer similarities (containment)
776+ # #now use mcl to quickly find the clusters
777+ mcl ${prefix} .starships_SLRs.pairwise_containment.tsv --abc -o ${prefix} .starships_SLRs.pairwise_containment.mcl.txt
778+ # #now name the clusters and then append to the summary files
779+ awk -F ' \t' ' {for (i=1; i <= NF; i++) {print "cluster"NR "\t" $i}}' ${prefix} .starships_SLRs.pairwise_containment.mcl.txt > ${prefix} .starships_SLRs.pairwise_containment.mcl.clusters.txt
780+
781+ echo " element;contig;start;end;cluster" | tr ' ;' ' \t' > ${prefix} .starships_SLRs.plus_clusters.tsv
782+ cat ${prefix} .SLRs.tsv | while read line
783+ do
784+ SLR=$( echo " ${line} " | awk ' {print $1}' )
785+ cat ${prefix} .starships_SLRs.pairwise_containment.mcl.clusters.txt | awk -v SLR=" $SLR " -v line=" $line " ' {if($2==SLR) print line"\t"$1}'
786+ done >> ${prefix} .starships_SLRs.plus_clusters.tsv
787+
788+ # #now use these clusters to generate plots
789+
790+
754791
755792cd ..
756793
0 commit comments