@@ -770,23 +770,146 @@ cat ${Rscriptpath2} | sed "s/PREFIX/${prefix}/g" | sed "s|PATHTOOUTPUT|${outputp
770770Rscript network_plotting.R
771771
772772
773-
774-
773+ # #get rid of header before feeding the pairwise similarities to mcl
774+ tail -n+2 ${prefix} .starships_SLRs.pairwise_containment.tsv > temp.fa
775775# #cluster all the Starships and SLRs using the same kmer similarities (containment)
776776# #now use mcl to quickly find the clusters
777- mcl ${prefix} .starships_SLRs.pairwise_containment.tsv --abc -o ${prefix} .starships_SLRs.pairwise_containment.mcl.txt
777+ mcl temp.fa --abc -o ${prefix} .starships_SLRs.pairwise_containment.mcl.txt
778+ rm temp.fa
778779# #now name the clusters and then append to the summary files
779780awk -F ' \t' ' {for (i=1; i <= NF; i++) {print "cluster"NR "\t" $i}}' ${prefix} .starships_SLRs.pairwise_containment.mcl.txt > ${prefix} .starships_SLRs.pairwise_containment.mcl.clusters.txt
780781
781782echo " element;contig;start;end;cluster" | tr ' ;' ' \t' > ${prefix} .starships_SLRs.plus_clusters.tsv
782- cat ${prefix} .SLRs .tsv | while read line
783+ tail -n+2 ../ ${prefix} .starships_SLRs .tsv | awk ' {print $1"\t"$3"\t"$4"\t"$5} ' | while read line
783784do
784785SLR=$( echo " ${line} " | awk ' {print $1}' )
785786cat ${prefix} .starships_SLRs.pairwise_containment.mcl.clusters.txt | awk -v SLR=" $SLR " -v line=" $line " ' {if($2==SLR) print line"\t"$1}'
786787done >> ${prefix} .starships_SLRs.plus_clusters.tsv
787788
788789# #now use these clusters to generate plots
790+ # #first create a simplified bed file for the annotations
791+ # #create header for annotations
792+ echo " contig;start;end;sense;gene;label" | tr ' ;' ' \t' > genes.bed
793+ # #now grab the coordinates, sense, name of gene and a label just for tyrs, duf3723 and mybs
794+ cat ${tyrRspath} | awk -F " \t" ' {if($3 == "mRNA") print $1"\t"$4"\t"$5"\t"$7"\t"$9}' | awk -F " ;Name=" ' {print $0"\t"$NF}' | awk ' {print $1"\t"$2"\t"$3"\t"$4"\t"$6}' | awk -v identifier=" $identifier " ' {if($5 ~ "_"identifier) {print $0"\ttyrR"} else if($5 ~ "_myb") {print $0"\tmyb"} else if($5 ~ "_duf3723") {print $0"\tduf3723"} else {print $0"\tNA"}}' >> genes.bed
795+
796+
797+ # #loop through each cluster defined previously to get a list of the SLRs and a list of genomes missing these elements to be used as an insertion site
798+ tail -n+2 ${prefix} .starships_SLRs.plus_clusters.tsv | cut -f5 | sort -u | while read cluster
799+ do
800+
801+ # #get the elements
802+ cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -v cluster=" $cluster " ' {if($5 == cluster) {print $1}}' > ${cluster} .list.txt
803+
804+ # ##extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion)
805+ # #also extract the full contigs in which the SLRs are found (this will be used for the actual alignment)
806+ if [ -f ${cluster} .regions_plus_flank.fa ]
807+ then
808+ rm ${cluster} .regions_plus_flank.fa
809+ fi
810+
811+ if [ -f ${cluster} .contigs.fa ]
812+ then
813+ rm ${cluster} .contigs.fa
814+ fi
815+
816+ echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .regions_plus_flank.tsv
817+ cat ${cluster} .list.txt | while read SLR
818+ do
819+ contig=$( cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -F " \t" -v SLR=" $SLR " ' {if($1 == SLR) print $2}' )
820+ contiglength=$( cat ../${prefix} .assemblies.fa.gz.fai | awk -v contig=" $contig " ' {if($1 == contig) print $2}' )
821+ cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -F " \t" -v SLR=" $SLR " -v flank=" $flank " ' {if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk ' {if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -v SLR=" $SLR " -v contiglength=" $contiglength " ' {if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
822+ done >> ${cluster} .regions_plus_flank.tsv
823+ tail -n+2 ${cluster} .regions_plus_flank.tsv | while read line
824+ do
825+ coords=$( echo " ${line} " | awk ' {print $1":"$2"-"$3}' )
826+ contig=$( echo " ${line} " | awk ' {print $1}' )
827+ SLR=$( echo " ${line} " | awk ' {print $4}' )
828+ samtools faidx ../${prefix} .assemblies.fa.gz " ${coords} " | awk -v SLR=" $SLR " ' {if($0 ~ ">") {print ">"SLR">>"$0} else {print}}' | sed ' s/>>>/ /g' >> ${cluster} .regions_plus_flank.fa
829+ samtools faidx ../${prefix} .assemblies.fa.gz " ${contig} " >> ${cluster} .contigs.fa
830+ done
831+
832+ # #take just one of the SLRs in the cluster; one with the largest sum of flank lengths on either side or the largest (only one if they are equal)
833+ topSLR=$( cat ${cluster} .list.txt | while read SLR
834+ do
835+ start=$( cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -F " \t" -v SLR=" $SLR " ' {if($1 == SLR) print $3}' )
836+ end=$( cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -F " \t" -v SLR=" $SLR " ' {if($1 == SLR) print $4}' )
789837
838+ startmoddiff=$( cat ${cluster} .regions_plus_flank.tsv | awk -F " \t" -v SLR=" $SLR " -v start=" $start " ' {if($4 == SLR) print start-$2}' )
839+ endmoddiff=$( cat ${cluster} .regions_plus_flank.tsv | awk -F " \t" -v SLR=" $SLR " -v end=" $end " ' {if($4 == SLR) print $3-end}' )
840+
841+ echo " ${SLR} ;${startmoddiff} ;${endmoddiff} " | tr ' ;' ' \t'
842+
843+ done | awk -v flank=" $flank " ' {if(($2+$3) > sumflank) {sumflank=($2+$3); SLR=$1}} END{print SLR}' )
844+ # #save the SLR+flank region to a temporary fasta file
845+ samtools faidx ${cluster} .regions_plus_flank.fa ${topSLR} > ${cluster} .regions_plus_flank.temp.fa
846+
847+ # #create a header for a bed file to be populated
848+ # #this bed file will dictate the regions to be aligned (which will be relative to the extracted regions)
849+ echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .regions_plus_flank.plotting.bed
850+
851+ # #add the coordinates etc for the SLRs to the same bed file
852+ # #then adding the topSLR first so that it will be plotted downstream and show the insertion site for this element
853+ tail -n+2 ${cluster} .regions_plus_flank.tsv | awk -v topSLR=" $topSLR " ' {if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster} .regions_plus_flank.plotting.bed
854+
855+ # #now we want to order the other SLR regions by similarity to the top SLR (based on the containment score)
856+ clusterlist=$( tail -n+2 ${cluster} .regions_plus_flank.tsv | awk -F " \t" ' {print $4}' | tr ' \n' ' ' )
857+
858+ awk -v top=" $topSLR " -v cluster=" $clusterlist " '
859+ BEGIN {
860+ split(cluster, clist, " ")
861+ for (i in clist) {
862+ clusterMap[clist[i]] = 1
863+ clusterCount++
864+ }
865+ }
866+ {
867+ if ($1 == top) {
868+ lines[$2] = $3
869+ } else if ($2 == top) {
870+ lines[$1] = $3
871+ }
872+ }
873+ END {
874+ n = asorti(lines, sorted, "@val_num_desc")
875+ found = 0
876+ for (i = 1; i <= n; i++) {
877+ target = sorted[i]
878+ if (target in clusterMap) {
879+ print target
880+ delete clusterMap[target]
881+ found++
882+ if (found == clusterCount) break
883+ }
884+ }
885+ }
886+ ' ${prefix} .starships_SLRs.pairwise_containment.tsv | while read element
887+ do
888+ tail -n+2 ${cluster} .regions_plus_flank.tsv | awk -v element=" $element " ' {if($4 == element) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster} .regions_plus_flank.plotting.bed
889+ done
890+
891+
892+ # #create a simple bed file for the SLRs regions
893+ echo " contig;start;end;SLR" | tr ' ;' ' \t' > ${cluster} .SLRs.bed
894+ cat ${prefix} .starships_SLRs.plus_clusters.tsv | awk -v cluster=" ${cluster} " ' {if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster} .SLRs.bed
895+
896+ # #generate all vs all alignments for the contigs
897+ # #remove self alignment and any alignment smaller than 1kb
898+ nucmer --maxmatch --minmatch 100 --delta ${cluster} .contigs.nucmer.delta ${cluster} .contigs.fa ${cluster} .contigs.fa
899+ paftools.js delta2paf ${cluster} .contigs.nucmer.delta | awk -F " \t" ' {if($1 != $6) {print}}' > ${cluster} .contigs.nucmer.paf
900+
901+
902+ # #automate the production of an R script using gggenomes to plot the alignment
903+ # #then use gggenome with R script to create the plots
904+ Rscriptpath=$( which gggenomes_skeleton.stargraph.R )
905+ cat ${Rscriptpath} | sed " s/CLUSTER/${cluster} /g" | sed " s|PATHTOOUTPUT|${outputpath} /5.SLR_starship_network_alignments|g" > ${cluster} .R
906+ Rscript ${cluster} .R
907+
908+ # #remove some of the fasta files
909+ rm ${cluster} .contigs.fa
910+ rm ${cluster} .contigs.nucmer.delta
911+
912+ done
790913
791914
792915cd ..
0 commit comments