Skip to content

Commit 7b2dfb8

Browse files
authored
Update stargraph.sh
fixed minimum alignment length for finding insertion contig; now just set to a minimum of 10kb instead of relative to the flank size as large flanks for plotting reduce the chances of finding large contiguous alignments and therefore eliminate insertion sites based on how they are found
1 parent 10ea5c8 commit 7b2dfb8

File tree

1 file changed

+29
-27
lines changed

1 file changed

+29
-27
lines changed

bin/stargraph.sh

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -419,8 +419,8 @@ sourmash sketch dna -p scaled=100,k=21 ${prefix}.SLRs.fa --singleton -o ${prefix
419419
##here we only use the maximum containment value for any pairwise comparison and therefore keeping the table symmetric
420420
sourmash compare ${prefix}.SLRs.sig --max-containment --csv ${prefix}.SLRs.sig.compare.csv --labels-to ${prefix}.SLRs.sig.compare.txt
421421
##convert to pairwise comparisons and change all values below 25% to 0
422-
minjac="0.25"
423-
cat ${prefix}.SLRs.sig.compare.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v minjac="$minjac" '{if($3 >= minjac) {print} else {print $1"\t"$2"\t0"}}' > ${prefix}.SLRs.sig.pairwise.tsv
422+
mincont="0.25"
423+
cat ${prefix}.SLRs.sig.compare.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v mincont="$mincont" '{if($3 >= mincont) {print} else {print $1"\t"$2"\t0"}}' > ${prefix}.SLRs.sig.pairwise.tsv
424424
##now use mcl to quickly find the clusters
425425
mcl ${prefix}.SLRs.sig.pairwise.tsv --abc -o ${prefix}.SLRs.sig.pairwise.mcl.txt
426426
##now name the clusters and then append to the summary files
@@ -477,9 +477,10 @@ echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv
477477
cat ${cluster}.list.txt | while read SLR
478478
do
479479
contig=$( cat ../2.PAVs_to_SLRs/${prefix}.SLRs.tsv | awk -F "\t" -v SLR="$SLR" '{if($1 == SLR) print $2}' )
480-
contiglength=$( cat ../${prefix}.assemblies.fa.gz.fai | awk -v contig="$contig" '{if($1 == contig) print $2}' )
481-
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.tsv | awk -F "\t" -v SLR="$SLR" -v flank="$flank" '{if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk '{if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -v SLR="$SLR" -v contiglength="$contiglength" '{if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
480+
contiglength=$( cat ../${prefix}.assemblies.fa.gz.fai | awk -F "\t" -v contig="$contig" '{if($1 == contig) print $2}' )
481+
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.tsv | awk -F "\t" -v SLR="$SLR" -v flank="$flank" '{if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk -F "\t" '{if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -F "\t" -v SLR="$SLR" -v contiglength="$contiglength" '{if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
482482
done >> ${cluster}.regions_plus_flank.tsv
483+
483484
tail -n+2 ${cluster}.regions_plus_flank.tsv | while read line
484485
do
485486
coords=$( echo "${line}" | awk '{print $1":"$2"-"$3}' )
@@ -501,6 +502,7 @@ endmoddiff=$( cat ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v SLR="$SLR"
501502
echo "${SLR};${startmoddiff};${endmoddiff}" | tr ';' '\t'
502503
503504
done | awk -v flank="$flank" '{if(($2+$3) > sumflank) {sumflank=($2+$3); SLR=$1}} END{print SLR}' )
505+
504506
##save the SLR+flank region to a temporary fasta file
505507
samtools faidx ${cluster}.regions_plus_flank.fa ${topSLR} > ${cluster}.regions_plus_flank.temp.fa
506508

@@ -528,20 +530,28 @@ nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.regions_plus_
528530
paftools.js delta2paf ${cluster}.regions_plus_flank.absent.nucmer.delta > ${cluster}.regions_plus_flank.absent.nucmer.paf
529531

530532
##now use the paf file to find a contigs with good aligning regions
531-
##good aligning can be that at least a single contig has a single alignment larger than half the flank region
533+
##good aligning can be that at least a single contig with a minimum alignment length of 10kb
532534
##take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes)
533-
##then try to find the edges of the alignments using 20kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
535+
##then try to find the edges of the alignments using 10kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
534536

535-
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="${flank}" '{if($11 > (flank/2) ) print}' | cut -f1 | sort -u | while read tempcontig
537+
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="$flank" '{if($11 > (10000) ) print}' | cut -f1 | sort -u | while read tempcontig
536538
do
537539
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}'
538-
done | sort -k2nr | head -n2 | awk '{print $1}' | while read insertioncontig
540+
done | sort -k2nr | head -n2 | awk '{print $1}' > ${cluster}.absent.contigs.txt
541+
542+
##extract the contigs
543+
cat ${cluster}.absent.contigs.txt | while read insertioncontig
544+
do
545+
samtools faidx ${cluster}.absent.fa "${insertioncontig}"
546+
done > ${cluster}.absent.contigs.fa
547+
548+
##get details on the aligned region for plotting (because we only aligned the example element 'top SLR' to the contigs, if we take the maximum region aligned, this should correspond to the edges of the flanks)
549+
cat ${cluster}.absent.contigs.txt | while read insertioncontig
539550
do
540551
strain=$( echo "${insertioncontig}" | awk -F "${separator}" '{print $1}' )
541552
edges=$( cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" '{if($11 > 20000) print}' | awk -F "\t" -v insertioncontig="$insertioncontig" '{if($1==insertioncontig) print}' | awk -v flank="$flank" 'BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk -F "\t" '{if($1 < 0) {print 0"\t"$2} else {print}}' )
542553
edges2=$( echo "${edges}" | awk -F "\t" '{print $1"-"$2}' )
543554
size=$( echo "${edges}" | awk -F "\t" '{print $2-$1}' )
544-
samtools faidx ${cluster}.absent.fa "${insertioncontig}" >> ${cluster}.absent.contigs.fa
545555
echo "${insertioncontig};${edges};NA" | tr ';' '\t' >> ${cluster}.regions_plus_flank.plotting.bed
546556
done
547557

@@ -591,7 +601,6 @@ do
591601
tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -v element="$element" '{if($4 == element) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed
592602
done
593603

594-
595604
##create a simple bed file for the SLRs regions
596605
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.SLRs.bed
597606
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="${cluster}" '{if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster}.SLRs.bed
@@ -602,7 +611,7 @@ cat ${cluster}.absent.contigs.fa >> ${cluster}.contigs.fa
602611

603612
##generate all vs all alignments for the contigs
604613
##remove self alignment and any alignment smaller than 1kb
605-
nucmer --maxmatch --minmatch 100 --delta ${cluster}.contigs.nucmer.delta ${cluster}.contigs.fa ${cluster}.contigs.fa
614+
nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.contigs.nucmer.delta ${cluster}.contigs.fa ${cluster}.contigs.fa
606615
paftools.js delta2paf ${cluster}.contigs.nucmer.delta | awk -F "\t" '{if($1 != $6) {print}}' > ${cluster}.contigs.nucmer.paf
607616

608617

@@ -702,7 +711,7 @@ cd ..
702711
#######################################################################################
703712

704713

705-
echo "Step 5: Generating network plots and further clustering/alignments using the non-redundant dataset"
714+
echo "Step 5a: Generating network plots using the non-redundant dataset"
706715

707716

708717
##we now have our Starship-SLR dataset we can see what this combined dataset looks like
@@ -726,17 +735,17 @@ sourmash compare -k 31 sourmash_signatures/*.sig.gz --max-containment --csv sour
726735

727736
rm temp.fa
728737

729-
##need a very low threshold to remove all very small similarities, here using 10% jaccard similarity
730-
threshold="10"
738+
##need a very low threshold to remove all very small similarities, here using 25% jaccard similarity/max-containment
739+
mincont="0.25"
731740
##generate header for the file that will be used to build the network
732741
echo "to;from;weight" | tr ';' '\t' > ${prefix}.starships_SLRs.pairwise_jaccard.tsv
733742
##now get a list of nonredundant pairwise jaccard similarities
734-
cat sourmash_signatures.compare_k31.jaccard.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v threshold="$threshold" '{if($3*100 > threshold) {print}}' >> ${prefix}.starships_SLRs.pairwise_jaccard.tsv
743+
cat sourmash_signatures.compare_k31.jaccard.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v mincont="$mincont" '{if($3*100 > mincont) {print}}' >> ${prefix}.starships_SLRs.pairwise_jaccard.tsv
735744

736745
##same but for the containment scores (we used max containment so the pairwise values are symmetric making this easy)
737746
echo "to;from;weight" | tr ';' '\t' > ${prefix}.starships_SLRs.pairwise_containment.tsv
738747
##now get a list of nonredundant pairwise jaccard similarities
739-
cat sourmash_signatures.compare_k31.containment.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v threshold="$threshold" '{if($3*100 > threshold) {print}}' >> ${prefix}.starships_SLRs.pairwise_containment.tsv
748+
cat sourmash_signatures.compare_k31.containment.csv | tr -d '\r' | awk -F',' 'NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS='\t' | awk -F "\t" -v mincont="$mincont" '{if($3*100 > mincont) {print}}' >> ${prefix}.starships_SLRs.pairwise_containment.tsv
740749

741750

742751
##also want a simplified metadata file used for plotting the networks
@@ -759,6 +768,9 @@ cat ${Rscriptpath2} | sed "s/PREFIX/${prefix}/g" | sed "s|PATHTOOUTPUT|${outputp
759768
Rscript network_plotting.R
760769

761770

771+
echo "Step 5b: Clustering Starships and SLRs and generating alignment plots"
772+
773+
762774
##get rid of header before feeding the pairwise similarities to mcl
763775
tail -n+2 ${prefix}.starships_SLRs.pairwise_containment.tsv > temp.fa
764776
##cluster all the Starships and SLRs using the same kmer similarities (containment)
@@ -792,16 +804,6 @@ cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if(
792804

793805
###extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion)
794806
##also extract the full contigs in which the SLRs are found (this will be used for the actual alignment)
795-
if [ -f ${cluster}.regions_plus_flank.fa ]
796-
then
797-
rm ${cluster}.regions_plus_flank.fa
798-
fi
799-
800-
if [ -f ${cluster}.contigs.fa ]
801-
then
802-
rm ${cluster}.contigs.fa
803-
fi
804-
805807
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv
806808
cat ${cluster}.list.txt | while read SLR
807809
do
@@ -884,7 +886,7 @@ cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="${cluster}" '{i
884886

885887
##generate all vs all alignments for the contigs
886888
##remove self alignment and any alignment smaller than 1kb
887-
nucmer --maxmatch --minmatch 100 --delta ${cluster}.contigs.nucmer.delta ${cluster}.contigs.fa ${cluster}.contigs.fa
889+
nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.contigs.nucmer.delta ${cluster}.contigs.fa ${cluster}.contigs.fa
888890
paftools.js delta2paf ${cluster}.contigs.nucmer.delta | awk -F "\t" '{if($1 != $6) {print}}' > ${cluster}.contigs.nucmer.paf
889891

890892

0 commit comments

Comments
 (0)