@@ -419,8 +419,8 @@ sourmash sketch dna -p scaled=100,k=21 ${prefix}.SLRs.fa --singleton -o ${prefix
419419# #here we only use the maximum containment value for any pairwise comparison and therefore keeping the table symmetric
420420sourmash compare ${prefix} .SLRs.sig --max-containment --csv ${prefix} .SLRs.sig.compare.csv --labels-to ${prefix} .SLRs.sig.compare.txt
421421# #convert to pairwise comparisons and change all values below 25% to 0
422- minjac =" 0.25"
423- cat ${prefix} .SLRs.sig.compare.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v minjac =" $minjac " ' {if($3 >= minjac ) {print} else {print $1"\t"$2"\t0"}}' > ${prefix} .SLRs.sig.pairwise.tsv
422+ mincont =" 0.25"
423+ cat ${prefix} .SLRs.sig.compare.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v mincont =" $mincont " ' {if($3 >= mincont ) {print} else {print $1"\t"$2"\t0"}}' > ${prefix} .SLRs.sig.pairwise.tsv
424424# #now use mcl to quickly find the clusters
425425mcl ${prefix} .SLRs.sig.pairwise.tsv --abc -o ${prefix} .SLRs.sig.pairwise.mcl.txt
426426# #now name the clusters and then append to the summary files
@@ -477,9 +477,10 @@ echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv
477477cat ${cluster} .list.txt | while read SLR
478478do
479479contig=$( cat ../2.PAVs_to_SLRs/${prefix} .SLRs.tsv | awk -F " \t" -v SLR=" $SLR " ' {if($1 == SLR) print $2}' )
480- contiglength=$( cat ../${prefix} .assemblies.fa.gz.fai | awk -v contig=" $contig " ' {if($1 == contig) print $2}' )
481- cat ../2.PAVs_to_SLRs/${prefix} .SLRs.tsv | awk -F " \t" -v SLR=" $SLR " -v flank=" $flank " ' {if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk ' {if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -v SLR=" $SLR " -v contiglength=" $contiglength " ' {if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
480+ contiglength=$( cat ../${prefix} .assemblies.fa.gz.fai | awk -F " \t " - v contig=" $contig " ' {if($1 == contig) print $2}' )
481+ cat ../2.PAVs_to_SLRs/${prefix} .SLRs.tsv | awk -F " \t" -v SLR=" $SLR " -v flank=" $flank " ' {if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk -F " \t " ' {if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -F " \t " -v SLR=" $SLR " -v contiglength=" $contiglength " ' {if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
482482done >> ${cluster} .regions_plus_flank.tsv
483+
483484tail -n+2 ${cluster} .regions_plus_flank.tsv | while read line
484485do
485486coords=$( echo " ${line} " | awk ' {print $1":"$2"-"$3}' )
@@ -501,6 +502,7 @@ endmoddiff=$( cat ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v SLR="$SLR"
501502echo " ${SLR} ;${startmoddiff} ;${endmoddiff} " | tr ' ;' ' \t'
502503
503504done | awk -v flank=" $flank " ' {if(($2+$3) > sumflank) {sumflank=($2+$3); SLR=$1}} END{print SLR}' )
505+
504506# #save the SLR+flank region to a temporary fasta file
505507samtools faidx ${cluster} .regions_plus_flank.fa ${topSLR} > ${cluster} .regions_plus_flank.temp.fa
506508
@@ -528,20 +530,28 @@ nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.regions_plus_
528530paftools.js delta2paf ${cluster} .regions_plus_flank.absent.nucmer.delta > ${cluster} .regions_plus_flank.absent.nucmer.paf
529531
530532# #now use the paf file to find a contigs with good aligning regions
531- # #good aligning can be that at least a single contig has a single alignment larger than half the flank region
533+ # #good aligning can be that at least a single contig with a minimum alignment length of 10kb
532534# #take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes)
533- # #then try to find the edges of the alignments using 20kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
535+ # #then try to find the edges of the alignments using 10kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
534536
535- cat ${cluster} .regions_plus_flank.absent.nucmer.paf | awk -F " \t" -v flank=" ${ flank} " ' {if($11 > (flank/2 ) ) print}' | cut -f1 | sort -u | while read tempcontig
537+ cat ${cluster} .regions_plus_flank.absent.nucmer.paf | awk -F " \t" -v flank=" $flank " ' {if($11 > (10000 ) ) print}' | cut -f1 | sort -u | while read tempcontig
536538do
537539cat ${cluster} .regions_plus_flank.absent.nucmer.paf | awk -F " \t" -v tempcontig=" $tempcontig " ' {if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}'
538- done | sort -k2nr | head -n2 | awk ' {print $1}' | while read insertioncontig
540+ done | sort -k2nr | head -n2 | awk ' {print $1}' > ${cluster} .absent.contigs.txt
541+
542+ # #extract the contigs
543+ cat ${cluster} .absent.contigs.txt | while read insertioncontig
544+ do
545+ samtools faidx ${cluster} .absent.fa " ${insertioncontig} "
546+ done > ${cluster} .absent.contigs.fa
547+
548+ # #get details on the aligned region for plotting (because we only aligned the example element 'top SLR' to the contigs, if we take the maximum region aligned, this should correspond to the edges of the flanks)
549+ cat ${cluster} .absent.contigs.txt | while read insertioncontig
539550do
540551strain=$( echo " ${insertioncontig} " | awk -F " ${separator} " ' {print $1}' )
541552edges=$( cat ${cluster} .regions_plus_flank.absent.nucmer.paf | awk -F " \t" ' {if($11 > 20000) print}' | awk -F " \t" -v insertioncontig=" $insertioncontig " ' {if($1==insertioncontig) print}' | awk -v flank=" $flank " ' BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk -F " \t" ' {if($1 < 0) {print 0"\t"$2} else {print}}' )
542553edges2=$( echo " ${edges} " | awk -F " \t" ' {print $1"-"$2}' )
543554size=$( echo " ${edges} " | awk -F " \t" ' {print $2-$1}' )
544- samtools faidx ${cluster} .absent.fa " ${insertioncontig} " >> ${cluster} .absent.contigs.fa
545555echo " ${insertioncontig} ;${edges} ;NA" | tr ' ;' ' \t' >> ${cluster} .regions_plus_flank.plotting.bed
546556done
547557
591601tail -n+2 ${cluster} .regions_plus_flank.tsv | awk -v element=" $element " ' {if($4 == element) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster} .regions_plus_flank.plotting.bed
592602done
593603
594-
595604# #create a simple bed file for the SLRs regions
596605echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .SLRs.bed
597606cat ../2.PAVs_to_SLRs/${prefix} .SLRs.plus_clusters.tsv | awk -v cluster=" ${cluster} " ' {if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster} .SLRs.bed
@@ -602,7 +611,7 @@ cat ${cluster}.absent.contigs.fa >> ${cluster}.contigs.fa
602611
603612# #generate all vs all alignments for the contigs
604613# #remove self alignment and any alignment smaller than 1kb
605- nucmer --maxmatch --minmatch 100 --delta ${cluster} .contigs.nucmer.delta ${cluster} .contigs.fa ${cluster} .contigs.fa
614+ nucmer -t ${threads} - -maxmatch --minmatch 100 --delta ${cluster} .contigs.nucmer.delta ${cluster} .contigs.fa ${cluster} .contigs.fa
606615paftools.js delta2paf ${cluster} .contigs.nucmer.delta | awk -F " \t" ' {if($1 != $6) {print}}' > ${cluster} .contigs.nucmer.paf
607616
608617
@@ -702,7 +711,7 @@ cd ..
702711# ######################################################################################
703712
704713
705- echo " Step 5 : Generating network plots and further clustering/alignments using the non-redundant dataset"
714+ echo " Step 5a : Generating network plots using the non-redundant dataset"
706715
707716
708717# #we now have our Starship-SLR dataset we can see what this combined dataset looks like
@@ -726,17 +735,17 @@ sourmash compare -k 31 sourmash_signatures/*.sig.gz --max-containment --csv sour
726735
727736rm temp.fa
728737
729- # #need a very low threshold to remove all very small similarities, here using 10 % jaccard similarity
730- threshold= " 10 "
738+ # #need a very low threshold to remove all very small similarities, here using 25 % jaccard similarity/max-containment
739+ mincont= " 0.25 "
731740# #generate header for the file that will be used to build the network
732741echo " to;from;weight" | tr ' ;' ' \t' > ${prefix} .starships_SLRs.pairwise_jaccard.tsv
733742# #now get a list of nonredundant pairwise jaccard similarities
734- cat sourmash_signatures.compare_k31.jaccard.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v threshold =" $threshold " ' {if($3*100 > threshold ) {print}}' >> ${prefix} .starships_SLRs.pairwise_jaccard.tsv
743+ cat sourmash_signatures.compare_k31.jaccard.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v mincont =" $mincont " ' {if($3*100 > mincont ) {print}}' >> ${prefix} .starships_SLRs.pairwise_jaccard.tsv
735744
736745# #same but for the containment scores (we used max containment so the pairwise values are symmetric making this easy)
737746echo " to;from;weight" | tr ' ;' ' \t' > ${prefix} .starships_SLRs.pairwise_containment.tsv
738747# #now get a list of nonredundant pairwise jaccard similarities
739- cat sourmash_signatures.compare_k31.containment.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v threshold =" $threshold " ' {if($3*100 > threshold ) {print}}' >> ${prefix} .starships_SLRs.pairwise_containment.tsv
748+ cat sourmash_signatures.compare_k31.containment.csv | tr -d ' \r' | awk -F' ,' ' NR==1{for(i=1;i<=NF;i++)samples[i]=$i;next}{row=NR-1;for(i=row+1;i<=NF;i++)print samples[row],samples[i],$i}' OFS=' \t' | awk -F " \t" -v mincont =" $mincont " ' {if($3*100 > mincont ) {print}}' >> ${prefix} .starships_SLRs.pairwise_containment.tsv
740749
741750
742751# #also want a simplified metadata file used for plotting the networks
@@ -759,6 +768,9 @@ cat ${Rscriptpath2} | sed "s/PREFIX/${prefix}/g" | sed "s|PATHTOOUTPUT|${outputp
759768Rscript network_plotting.R
760769
761770
771+ echo " Step 5b: Clustering Starships and SLRs and generating alignment plots"
772+
773+
762774# #get rid of header before feeding the pairwise similarities to mcl
763775tail -n+2 ${prefix} .starships_SLRs.pairwise_containment.tsv > temp.fa
764776# #cluster all the Starships and SLRs using the same kmer similarities (containment)
@@ -792,16 +804,6 @@ cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if(
792804
793805# ##extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion)
794806# #also extract the full contigs in which the SLRs are found (this will be used for the actual alignment)
795- if [ -f ${cluster} .regions_plus_flank.fa ]
796- then
797- rm ${cluster} .regions_plus_flank.fa
798- fi
799-
800- if [ -f ${cluster} .contigs.fa ]
801- then
802- rm ${cluster} .contigs.fa
803- fi
804-
805807echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .regions_plus_flank.tsv
806808cat ${cluster} .list.txt | while read SLR
807809do
@@ -884,7 +886,7 @@ cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="${cluster}" '{i
884886
885887# #generate all vs all alignments for the contigs
886888# #remove self alignment and any alignment smaller than 1kb
887- nucmer --maxmatch --minmatch 100 --delta ${cluster} .contigs.nucmer.delta ${cluster} .contigs.fa ${cluster} .contigs.fa
889+ nucmer -t ${threads} - -maxmatch --minmatch 100 --delta ${cluster} .contigs.nucmer.delta ${cluster} .contigs.fa ${cluster} .contigs.fa
888890paftools.js delta2paf ${cluster} .contigs.nucmer.delta | awk -F " \t" ' {if($1 != $6) {print}}' > ${cluster} .contigs.nucmer.paf
889891
890892
0 commit comments