Skip to content

Commit 6dabe90

Browse files
authored
Update stargraph.sh
1 parent 6b8f743 commit 6dabe90

File tree

1 file changed

+127
-4
lines changed

1 file changed

+127
-4
lines changed

bin/stargraph.sh

Lines changed: 127 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -770,23 +770,146 @@ cat ${Rscriptpath2} | sed "s/PREFIX/${prefix}/g" | sed "s|PATHTOOUTPUT|${outputp
770770
Rscript network_plotting.R
771771

772772

773-
774-
773+
##get rid of header before feeding the pairwise similarities to mcl
774+
tail -n+2 ${prefix}.starships_SLRs.pairwise_containment.tsv > temp.fa
775775
##cluster all the Starships and SLRs using the same kmer similarities (containment)
776776
##now use mcl to quickly find the clusters
777-
mcl ${prefix}.starships_SLRs.pairwise_containment.tsv --abc -o ${prefix}.starships_SLRs.pairwise_containment.mcl.txt
777+
mcl temp.fa --abc -o ${prefix}.starships_SLRs.pairwise_containment.mcl.txt
778+
rm temp.fa
778779
##now name the clusters and then append to the summary files
779780
awk -F '\t' '{for (i=1; i <= NF; i++) {print "cluster"NR "\t" $i}}' ${prefix}.starships_SLRs.pairwise_containment.mcl.txt > ${prefix}.starships_SLRs.pairwise_containment.mcl.clusters.txt
780781

781782
echo "element;contig;start;end;cluster" | tr ';' '\t' > ${prefix}.starships_SLRs.plus_clusters.tsv
782-
cat ${prefix}.SLRs.tsv | while read line
783+
tail -n+2 ../${prefix}.starships_SLRs.tsv | awk '{print $1"\t"$3"\t"$4"\t"$5}' | while read line
783784
do
784785
SLR=$( echo "${line}" | awk '{print $1}' )
785786
cat ${prefix}.starships_SLRs.pairwise_containment.mcl.clusters.txt | awk -v SLR="$SLR" -v line="$line" '{if($2==SLR) print line"\t"$1}'
786787
done >> ${prefix}.starships_SLRs.plus_clusters.tsv
787788

788789
##now use these clusters to generate plots
790+
##first create a simplified bed file for the annotations
791+
##create header for annotations
792+
echo "contig;start;end;sense;gene;label" | tr ';' '\t' > genes.bed
793+
##now grab the coordinates, sense, name of gene and a label just for tyrs, duf3723 and mybs
794+
cat ${tyrRspath} | awk -F "\t" '{if($3 == "mRNA") print $1"\t"$4"\t"$5"\t"$7"\t"$9}' | awk -F ";Name=" '{print $0"\t"$NF}' | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$6}' | awk -v identifier="$identifier" '{if($5 ~ "_"identifier) {print $0"\ttyrR"} else if($5 ~ "_myb") {print $0"\tmyb"} else if($5 ~ "_duf3723") {print $0"\tduf3723"} else {print $0"\tNA"}}' >> genes.bed
795+
796+
797+
##loop through each cluster defined previously to get a list of the SLRs and a list of genomes missing these elements to be used as an insertion site
798+
tail -n+2 ${prefix}.starships_SLRs.plus_clusters.tsv | cut -f5 | sort -u | while read cluster
799+
do
800+
801+
##get the elements
802+
cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if($5 == cluster) {print $1}}' > ${cluster}.list.txt
803+
804+
###extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion)
805+
##also extract the full contigs in which the SLRs are found (this will be used for the actual alignment)
806+
if [ -f ${cluster}.regions_plus_flank.fa ]
807+
then
808+
rm ${cluster}.regions_plus_flank.fa
809+
fi
810+
811+
if [ -f ${cluster}.contigs.fa ]
812+
then
813+
rm ${cluster}.contigs.fa
814+
fi
815+
816+
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv
817+
cat ${cluster}.list.txt | while read SLR
818+
do
819+
contig=$( cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -F "\t" -v SLR="$SLR" '{if($1 == SLR) print $2}' )
820+
contiglength=$( cat ../${prefix}.assemblies.fa.gz.fai | awk -v contig="$contig" '{if($1 == contig) print $2}' )
821+
cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -F "\t" -v SLR="$SLR" -v flank="$flank" '{if($1 == SLR) print $2"\t"$3-flank"\t"$4+flank}' | awk '{if($2 < 0 ) {print $1"\t1\t"$3} else {print}}' | awk -v SLR="$SLR" -v contiglength="$contiglength" '{if($3 > contiglength ) {print $1"\t"$2"\t"contiglength"\t"SLR} else {print $0"\t"SLR}}'
822+
done >> ${cluster}.regions_plus_flank.tsv
823+
tail -n+2 ${cluster}.regions_plus_flank.tsv | while read line
824+
do
825+
coords=$( echo "${line}" | awk '{print $1":"$2"-"$3}' )
826+
contig=$( echo "${line}" | awk '{print $1}' )
827+
SLR=$( echo "${line}" | awk '{print $4}' )
828+
samtools faidx ../${prefix}.assemblies.fa.gz "${coords}" | awk -v SLR="$SLR" '{if($0 ~ ">") {print ">"SLR">>"$0} else {print}}' | sed 's/>>>/ /g' >> ${cluster}.regions_plus_flank.fa
829+
samtools faidx ../${prefix}.assemblies.fa.gz "${contig}" >> ${cluster}.contigs.fa
830+
done
831+
832+
##take just one of the SLRs in the cluster; one with the largest sum of flank lengths on either side or the largest (only one if they are equal)
833+
topSLR=$( cat ${cluster}.list.txt | while read SLR
834+
do
835+
start=$( cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -F "\t" -v SLR="$SLR" '{if($1 == SLR) print $3}' )
836+
end=$( cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -F "\t" -v SLR="$SLR" '{if($1 == SLR) print $4}' )
789837
838+
startmoddiff=$( cat ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v SLR="$SLR" -v start="$start" '{if($4 == SLR) print start-$2}' )
839+
endmoddiff=$( cat ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v SLR="$SLR" -v end="$end" '{if($4 == SLR) print $3-end}' )
840+
841+
echo "${SLR};${startmoddiff};${endmoddiff}" | tr ';' '\t'
842+
843+
done | awk -v flank="$flank" '{if(($2+$3) > sumflank) {sumflank=($2+$3); SLR=$1}} END{print SLR}' )
844+
##save the SLR+flank region to a temporary fasta file
845+
samtools faidx ${cluster}.regions_plus_flank.fa ${topSLR} > ${cluster}.regions_plus_flank.temp.fa
846+
847+
##create a header for a bed file to be populated
848+
##this bed file will dictate the regions to be aligned (which will be relative to the extracted regions)
849+
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.plotting.bed
850+
851+
##add the coordinates etc for the SLRs to the same bed file
852+
##then adding the topSLR first so that it will be plotted downstream and show the insertion site for this element
853+
tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -v topSLR="$topSLR" '{if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed
854+
855+
##now we want to order the other SLR regions by similarity to the top SLR (based on the containment score)
856+
clusterlist=$( tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -F "\t" '{print $4}' | tr '\n' ' ' )
857+
858+
awk -v top="$topSLR" -v cluster="$clusterlist" '
859+
BEGIN {
860+
split(cluster, clist, " ")
861+
for (i in clist) {
862+
clusterMap[clist[i]] = 1
863+
clusterCount++
864+
}
865+
}
866+
{
867+
if ($1 == top) {
868+
lines[$2] = $3
869+
} else if ($2 == top) {
870+
lines[$1] = $3
871+
}
872+
}
873+
END {
874+
n = asorti(lines, sorted, "@val_num_desc")
875+
found = 0
876+
for (i = 1; i <= n; i++) {
877+
target = sorted[i]
878+
if (target in clusterMap) {
879+
print target
880+
delete clusterMap[target]
881+
found++
882+
if (found == clusterCount) break
883+
}
884+
}
885+
}
886+
' ${prefix}.starships_SLRs.pairwise_containment.tsv | while read element
887+
do
888+
tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -v element="$element" '{if($4 == element) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed
889+
done
890+
891+
892+
##create a simple bed file for the SLRs regions
893+
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.SLRs.bed
894+
cat ${prefix}.starships_SLRs.plus_clusters.tsv | awk -v cluster="${cluster}" '{if($5 == cluster) {print $2"\t"$3"\t"$4"\t"$1}}' >> ${cluster}.SLRs.bed
895+
896+
##generate all vs all alignments for the contigs
897+
##remove self alignment and any alignment smaller than 1kb
898+
nucmer --maxmatch --minmatch 100 --delta ${cluster}.contigs.nucmer.delta ${cluster}.contigs.fa ${cluster}.contigs.fa
899+
paftools.js delta2paf ${cluster}.contigs.nucmer.delta | awk -F "\t" '{if($1 != $6) {print}}' > ${cluster}.contigs.nucmer.paf
900+
901+
902+
##automate the production of an R script using gggenomes to plot the alignment
903+
##then use gggenome with R script to create the plots
904+
Rscriptpath=$( which gggenomes_skeleton.stargraph.R )
905+
cat ${Rscriptpath} | sed "s/CLUSTER/${cluster}/g" | sed "s|PATHTOOUTPUT|${outputpath}/5.SLR_starship_network_alignments|g" > ${cluster}.R
906+
Rscript ${cluster}.R
907+
908+
##remove some of the fasta files
909+
rm ${cluster}.contigs.fa
910+
rm ${cluster}.contigs.nucmer.delta
911+
912+
done
790913

791914

792915
cd ..

0 commit comments

Comments
 (0)