Skip to content

Commit 5e62c9b

Browse files
authored
Update stargraph.sh
1 parent 244c1e2 commit 5e62c9b

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

bin/stargraph.sh

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ length="20000"
6262
kmersize="19"
6363
separator="_"
6464
minsize="30000"
65+
maxsize="2000000"
6566
window="1000"
6667
flank="75000"
6768
poaparam="7919,8069"
@@ -133,6 +134,11 @@ case "$key" in
133134
shift
134135
shift
135136
;;
137+
-x|--maxsize)
138+
minsize="$2"
139+
shift
140+
shift
141+
;;
136142
-w|--window)
137143
window="$2"
138144
shift
@@ -184,6 +190,7 @@ case "$key" in
184190
Optional parameters:
185191
-s | --separator PanSN-spec-like naming separator used (Default: _)
186192
-m | --minsize Minimum size of PAVs to be kept (Default: 30000)
193+
-x | --maxsize Maximum size of SLRs to be kept; filter only applied after starship merging (Default: 2000000)
187194
-w | --window Size of windows used for PAV detection (Default: 1000)
188195
-f | --flank Size of flanking region used when plotting element alignments (Default: 75000)
189196
-p | --prefix Prefix for output (Default: stargraph)
@@ -526,17 +533,17 @@ done > ${cluster}.absent.fa
526533
echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.plotting.bed
527534

528535
##now align the SLR to the empty contigs (here only the flanks should have large aligning regions)
529-
nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.regions_plus_flank.absent.nucmer.delta ${cluster}.regions_plus_flank.temp.fa ${cluster}.absent.fa
536+
nucmer -t ${threads} --maxmatch --minmatch 100 --delta ${cluster}.regions_plus_flank.absent.nucmer.delta ${cluster}.regions_plus_flank.temp.fa ${cluster}.absent.fa
530537
paftools.js delta2paf ${cluster}.regions_plus_flank.absent.nucmer.delta > ${cluster}.regions_plus_flank.absent.nucmer.paf
531538

532-
##now use the paf file to find a contigs with good aligning regions
533-
##good aligning can be that at least a single contig with a minimum alignment length of 10kb
534-
##take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes)
539+
##now use the paf file to find a contigs with good aligning regions (in a rather simplistic manner; which we can do as the input assemblies should be long-read)
540+
##good aligning can be that the contig has at least one alignment with a length > 20kb (remove chances of TE content influencing calculcations)
541+
##then take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes)
542+
##to find the 'best'; sum all regions aligning above 10kb in length and take the largest sum (10kb again helps with repeat content)
535543
##then try to find the edges of the alignments using 10kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised)
536-
537-
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="$flank" '{if($11 > (10000) ) print}' | cut -f1 | sort -u | while read tempcontig
544+
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="$flank" '{if($11 > (20000) ) print}' | cut -f1 | sort -u | while read tempcontig
538545
do
539-
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}'
546+
cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v tempcontig="$tempcontig" '{if($1 == tempcontig && $11 > 10000) sum=sum+$11} END{print tempcontig"\t"sum}'
540547
done | sort -k2nr | head -n2 | awk '{print $1}' > ${cluster}.absent.contigs.txt
541548

542549
##extract the contigs
@@ -657,7 +664,7 @@ tail -n+2 ${elementspath} | cut -f1,4,6-7 | awk '{print $2"\t"$3"\t"$4"\t"$1}' >
657664

658665

659666
##now do the subtraction from the SLRs and rename if any SLRs were split with multiple sections still remaning
660-
bedtools subtract -a ${prefix}.SLRs.bed -b ${prefix}.starships.bed | sortBed -i - | awk -v minsize="$minsize" '{if($3-$2 >= minsize) print}' | awk '{if(SLR==$4) {print contig"\t"start"\t"end"\t"SLR"_"n ; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n++; hold="hold"} else if(SLR != $4 && hold=="hold") {print line"_"n; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"} else {print line; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"}} END{if(hold=="hold") {print line"_"n; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"} else {print line; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"}}' | tail -n+2 > ${prefix}.SLRs.starships_subtracted.bed
667+
bedtools subtract -a ${prefix}.SLRs.bed -b ${prefix}.starships.bed | sortBed -i - | awk -v minsize="$minsize" -v maxsize="$maxsize" '{if(($3-$2) >= minsize && ($3-$2) <= maxsize) print}' | awk '{if(SLR==$4) {print contig"\t"start"\t"end"\t"SLR"_"n ; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n++; hold="hold"} else if(SLR != $4 && hold=="hold") {print line"_"n; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"} else {print line; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"}} END{if(hold=="hold") {print line"_"n; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"} else {print line; contig=$1 ; start=$2; end=$3; SLR=$4; line=$0; n=1; hold="nohold"}}' | tail -n+2 > ${prefix}.SLRs.starships_subtracted.bed
661668

662669
##now use this subtracted bedfile to create a starship compatable summary file with captain positions (if present in the new SLR chunk)
663670
echo "SLR;navis-haplotype;contig;start;end;size;captain;captain_start;captain_end;captain_size;captain_sense" | sed 's/;/\t/g' > ${prefix}.SLRs.starships_subtracted.tyrRs.tsv

0 commit comments

Comments
 (0)