Skip to content

Commit a34dbe1

Browse files
authored
Update stargraph.sh
1 parent 8bb4f18 commit a34dbe1

File tree

1 file changed

+12
-24
lines changed

1 file changed

+12
-24
lines changed

bin/stargraph.sh

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -459,27 +459,18 @@ do
459459
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if($5 == cluster) {print $1}}' > ${cluster}.list.txt
460460

461461
##get the genomes missed these SLRs
462-
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if($5 == cluster) print}' | while read line
462+
cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -F "\t" -v cluster="$cluster" '{if($5 == cluster) print}' | while read line
463463
do
464-
SLR=$( echo "${line}" | awk '{print $1}' )
465-
position=$( echo "${line}" | awk '{print $2"\t"$3"\t"$4}' )
464+
SLR=$( echo "${line}" | awk -F "\t" '{print $1}' )
465+
position=$( echo "${line}" | awk -F "\t" '{print $2"\t"$3"\t"$4}' )
466466
cat ../1.${prefix}.pggb/${prefix}.PAVs.${minsize2}kb_min.tsv | grep "${position}" | awk '{print $4}' | tr ';' '\n' | while read absent
467467
do
468468
cat ${cluster}.list.txt | awk -F "${separator}" -v absent="$absent" -v hold="absent" '{if($1 == absent) {hold="present"}} END{if(hold=="absent") print absent}'
469-
done > ${cluster}.absent.txt
470-
done
469+
done
470+
done | sort -u > ${cluster}.absent.txt
471471

472472
###extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion)
473473
##also extract the full contigs in which the SLRs are found (this will be used for the actual alignment)
474-
if [ -f ${cluster}.regions_plus_flank.fa ]
475-
then
476-
rm ${cluster}.regions_plus_flank.fa
477-
fi
478-
479-
if [ -f ${cluster}.contigs.fa ]
480-
then
481-
rm ${cluster}.contigs.fa
482-
fi
483474

484475
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv
485476
cat ${cluster}.list.txt | while read SLR
@@ -539,26 +530,23 @@ paftools.js delta2paf ${cluster}.regions_plus_flank.absent.nucmer.delta > ${clus
539530
##good aligning can be that at least a single contig has a single alignment larger than half the flank region
540531
##take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes)
541532
##then try to find the edges of the alignments using 20kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
542-
if [ -f ${cluster}.absent.contigs.fa ]
543-
then
544-
rm ${cluster}.absent.contigs.fa
545-
fi
546-
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -v flank="${flank}" '{if($11 > (flank/2) ) print}' | cut -f1 | sort -u | while read tempcontig
533+
534+
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="${flank}" '{if($11 > (flank/2) ) print}' | cut -f1 | sort -u | while read tempcontig
547535
do
548-
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}'
536+
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}'
549537
done | sort -k2nr | head -n2 | awk '{print $1}' | while read insertioncontig
550538
do
551539
strain=$( echo "${insertioncontig}" | awk -F "${separator}" '{print $1}' )
552-
edges=$( cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk '{if($11 > 20000) print}' | awk -v insertioncontig="$insertioncontig" '{if($1==insertioncontig) print}' | awk -v flank="$flank" 'BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk '{if($1 < 0) {print 0"\t"$2} else {print}}' )
553-
edges2=$( echo "${edges}" | awk '{print $1"-"$2}' )
554-
size=$( echo "${edges}" | awk '{print $2-$1}' )
540+
edges=$( cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" '{if($11 > 20000) print}' | awk -F "\t" -v insertioncontig="$insertioncontig" '{if($1==insertioncontig) print}' | awk -v flank="$flank" 'BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk -F "\t" '{if($1 < 0) {print 0"\t"$2} else {print}}' )
541+
edges2=$( echo "${edges}" | awk -F "\t" '{print $1"-"$2}' )
542+
size=$( echo "${edges}" | awk -F "\t" '{print $2-$1}' )
555543
samtools faidx ${cluster}.absent.fa "${insertioncontig}" >> ${cluster}.absent.contigs.fa
556544
echo "${insertioncontig};${edges};NA" | tr ';' '\t' >> ${cluster}.regions_plus_flank.plotting.bed
557545
done
558546

559547
##add the coordinates etc for the SLRs to the same bed file (doing it in this order so in the plotting the absent regions will be on top)
560548
##then adding the topSLR first so that it will be plotted downstream and show the insertion site for this element
561-
tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -v topSLR="$topSLR" '{if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed
549+
tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v topSLR="$topSLR" '{if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed
562550

563551
##here ideally the order of the next SLRs should be based on the largest similarity stepwise
564552
## can use ../2.PAVs_to_SLRs/${prefix}.SLRs.sig.pairwise.tsv

0 commit comments

Comments
 (0)