Skip to content

Commit f96f1ae

Browse files
authored
Update stargraph.sh
1 parent 3728e46 commit f96f1ae

File tree

1 file changed

+40
-3
lines changed

1 file changed

+40
-3
lines changed

bin/stargraph.sh

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ done
211211
####################### 0b. SETTING UP #######################
212212
##############################################################
213213

214+
echo "Welcome to Stargraph; lets get things set up"
215+
214216
##redefining some variables and checking
215217

216218
##paths to raw data
@@ -247,6 +249,9 @@ flank2=$( echo ${flank} | awk '{print $1/1000}' )
247249
################# STEP 1: CREATING THE GENOME GRAPH #################
248250
#####################################################################
249251

252+
echo "Paths look good; lets begin"
253+
echo "Step 1: Building the genome-graph"
254+
250255

251256
##calculate the minimum nucleotide identity between all assemblies provided
252257
##this will made slightly more lentient (adding 5% more divergence) and provided as the -p option in pggb (if not provided manually)
@@ -287,6 +292,7 @@ cd 1.${prefix}.pggb
287292
################# STEP 2: IDENTIFYING PRESENCE/ABSENCE VARIANT #################
288293
################################################################################
289294

295+
echo "Step 2: Identifying Presence-Absence variants"
290296

291297
##the simpliest way to do so was to use the odgi presence-absence 'PAV' function
292298
##it can produce a matrix of whether regions are present or absent in for each strain
@@ -353,6 +359,7 @@ cd ..
353359
################# STEP 3a: IDENTIFYING STARSHIP-LIKE REGIONS ###################
354360
################################################################################
355361

362+
echo "Step 3a: Elevating PAVs to Starship-like regions"
356363

357364
mkdir 2.PAVs_to_SLRs
358365
cd 2.PAVs_to_SLRs
@@ -431,6 +438,9 @@ cd ..
431438
################# STEP 3b: GENERATING SLR PLOTS ###################
432439
###################################################################
433440

441+
echo "Step 3b: Clustering SLRs; aligning clusters and generating plots"
442+
443+
434444
mkdir 3.SLR_plots
435445
cd 3.SLR_plots
436446

@@ -597,8 +607,6 @@ done
597607
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.SLRs.bed
598608
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="${cluster}" '{if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster}.SLRs.bed
599609

600-
601-
602610
##add the contigs missing the elements to the complete contigs file
603611
cat ${cluster}.absent.contigs.fa >> ${cluster}.contigs.fa
604612

@@ -615,18 +623,27 @@ Rscriptpath=$( which gggenomes_skeleton.stargraph.R )
615623
cat ${Rscriptpath} | sed "s/CLUSTER/${cluster}/g" | sed "s|PATHTOOUTPUT|${outputpath}/3.SLR_plots|g" > ${cluster}.R
616624
Rscript ${cluster}.R
617625

626+
##remove some of the fasta files
627+
rm ${cluster}.contigs.fa
628+
rm ${cluster}.absent.*
629+
rm ${cluster}.contigs.nucmer.delta
630+
618631
done
619632

620633
##cleanup the default printing of Rplots
621634
rm Rplots.pdf
622635

636+
623637
cd ..
624638

625639
##########################################################################
626640
################# STEP 4: COMBINING SLRs AND STARSHIPS ###################
627641
##########################################################################
628642

629643

644+
echo "Step 4: Combining SLRs and Starships into a non-redundant dataset"
645+
646+
630647
##we now have out SLR dataset and can combine this with the Starship input from starfish
631648
##in doing so we actually do three simple steps
632649
##Step 1: subtract all the SLR regions that overlap with a starfish region
@@ -696,6 +713,9 @@ cd ..
696713
#######################################################################################
697714

698715

716+
echo "Step 5: Generating network plots and further clustering/alignments using the non-redundant dataset"
717+
718+
699719
##we now have our Starship-SLR dataset we can see what this combined dataset looks like
700720
##we can do this in terms of kmer similiarity initially
701721
##this allows us to do two thing; build networks of the similarities AND cluster Starships and SLRs and align them to one another
@@ -712,10 +732,11 @@ awk -F " " '{print $1}' ../4.SLR_starship_combination/${prefix}.starships_SLRs.f
712732
sourmash sketch dna -p k=31,noabund --singleton -o sourmash_signatures/ temp.fa
713733
##first we can get the jaccard similarity
714734
sourmash compare -k 31 sourmash_signatures/*.sig.gz --csv sourmash_signatures.compare_k31.jaccard.csv
715-
716735
##and second we can get the max pairwise containment score
717736
sourmash compare -k 31 sourmash_signatures/*.sig.gz --max-containment --csv sourmash_signatures.compare_k31.containment.csv
718737

738+
rm temp.fa
739+
719740
##need a very low threshold to remove all very small similarities, here using 10% jaccard similarity
720741
threshold="10"
721742
##generate header for the file that will be used to build the network
@@ -751,6 +772,22 @@ Rscript network_plotting.R
751772

752773

753774

775+
##cluster all the Starships and SLRs using the same kmer similarities (containment)
776+
##now use mcl to quickly find the clusters
777+
mcl ${prefix}.starships_SLRs.pairwise_containment.tsv --abc -o ${prefix}.starships_SLRs.pairwise_containment.mcl.txt
778+
##now name the clusters and then append to the summary files
779+
awk -F '\t' '{for (i=1; i <= NF; i++) {print "cluster"NR "\t" $i}}' ${prefix}.starships_SLRs.pairwise_containment.mcl.txt > ${prefix}.starships_SLRs.pairwise_containment.mcl.clusters.txt
780+
781+
echo "element;contig;start;end;cluster" | tr ';' '\t' > ${prefix}.starships_SLRs.plus_clusters.tsv
782+
cat ${prefix}.SLRs.tsv | while read line
783+
do
784+
SLR=$( echo "${line}" | awk '{print $1}' )
785+
cat ${prefix}.starships_SLRs.pairwise_containment.mcl.clusters.txt | awk -v SLR="$SLR" -v line="$line" '{if($2==SLR) print line"\t"$1}'
786+
done >> ${prefix}.starships_SLRs.plus_clusters.tsv
787+
788+
##now use these clusters to generate plots
789+
790+
754791

755792
cd ..
756793

0 commit comments

Comments
 (0)