|
459 | 459 | cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if($5 == cluster) {print $1}}' > ${cluster}.list.txt |
460 | 460 |
|
461 | 461 | ##get the genomes missed these SLRs |
462 | | -cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -v cluster="$cluster" '{if($5 == cluster) print}' | while read line |
| 462 | +cat ../2.PAVs_to_SLRs/${prefix}.SLRs.plus_clusters.tsv | awk -F "\t" -v cluster="$cluster" '{if($5 == cluster) print}' | while read line |
463 | 463 | do |
464 | | -SLR=$( echo "${line}" | awk '{print $1}' ) |
465 | | -position=$( echo "${line}" | awk '{print $2"\t"$3"\t"$4}' ) |
| 464 | +SLR=$( echo "${line}" | awk -F "\t" '{print $1}' ) |
| 465 | +position=$( echo "${line}" | awk -F "\t" '{print $2"\t"$3"\t"$4}' ) |
466 | 466 | cat ../1.${prefix}.pggb/${prefix}.PAVs.${minsize2}kb_min.tsv | grep "${position}" | awk '{print $4}' | tr ';' '\n' | while read absent |
467 | 467 | do |
468 | 468 | cat ${cluster}.list.txt | awk -F "${separator}" -v absent="$absent" -v hold="absent" '{if($1 == absent) {hold="present"}} END{if(hold=="absent") print absent}' |
469 | | -done > ${cluster}.absent.txt |
470 | | -done |
| 469 | +done |
| 470 | +done | sort -u > ${cluster}.absent.txt |
471 | 471 |
|
472 | 472 | ###extract the SLRs PLUS the flanking regions around them into a single fasta for alignment (this will be used to identify a good region to visualise the insertion) |
473 | 473 | ##also extract the full contigs in which the SLRs are found (this will be used for the actual alignment) |
474 | | -if [ -f ${cluster}.regions_plus_flank.fa ] |
475 | | -then |
476 | | -rm ${cluster}.regions_plus_flank.fa |
477 | | -fi |
478 | | - |
479 | | -if [ -f ${cluster}.contigs.fa ] |
480 | | -then |
481 | | -rm ${cluster}.contigs.fa |
482 | | -fi |
483 | 474 |
|
484 | 475 | echo "contig;start;end;SLR" | tr ';' '\t' > ${cluster}.regions_plus_flank.tsv |
485 | 476 | cat ${cluster}.list.txt | while read SLR |
@@ -539,26 +530,23 @@ paftools.js delta2paf ${cluster}.regions_plus_flank.absent.nucmer.delta > ${clus |
539 | 530 | ##good aligning can be that at least a single contig has a single alignment larger than half the flank region |
540 | 531 | ##take the best two aligning contigs from the dataset (ideally it'll be large contigs from two different genomes) |
541 | 532 | ##then try to find the edges of the alignments using 20kb seeds (this will be used for the plotting; i.e. only this regions alignment visualised) |
542 | | -if [ -f ${cluster}.absent.contigs.fa ] |
543 | | -then |
544 | | -rm ${cluster}.absent.contigs.fa |
545 | | -fi |
546 | | -cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -v flank="${flank}" '{if($11 > (flank/2) ) print}' | cut -f1 | sort -u | while read tempcontig |
| 533 | + |
| 534 | +cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v flank="${flank}" '{if($11 > (flank/2) ) print}' | cut -f1 | sort -u | while read tempcontig |
547 | 535 | do |
548 | | -cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}' |
| 536 | +cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" -v tempcontig="$tempcontig" '{if($1 == tempcontig) sum=sum+$11} END{print tempcontig"\t"sum}' |
549 | 537 | done | sort -k2nr | head -n2 | awk '{print $1}' | while read insertioncontig |
550 | 538 | do |
551 | 539 | strain=$( echo "${insertioncontig}" | awk -F "${separator}" '{print $1}' ) |
552 | | -edges=$( cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk '{if($11 > 20000) print}' | awk -v insertioncontig="$insertioncontig" '{if($1==insertioncontig) print}' | awk -v flank="$flank" 'BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk '{if($1 < 0) {print 0"\t"$2} else {print}}' ) |
553 | | -edges2=$( echo "${edges}" | awk '{print $1"-"$2}' ) |
554 | | -size=$( echo "${edges}" | awk '{print $2-$1}' ) |
| 540 | +edges=$( cat ${cluster}.regions_plus_flank.absent.nucmer.paf | awk -F "\t" '{if($11 > 20000) print}' | awk -F "\t" -v insertioncontig="$insertioncontig" '{if($1==insertioncontig) print}' | awk -v flank="$flank" 'BEGIN{max=0; min=99999999999999} {if($4 > max) {max=$4}; if($3 < min) {min=$3}} END{print min-(flank/2)"\t"max+(flank/2)}' | awk -F "\t" '{if($1 < 0) {print 0"\t"$2} else {print}}' ) |
| 541 | +edges2=$( echo "${edges}" | awk -F "\t" '{print $1"-"$2}' ) |
| 542 | +size=$( echo "${edges}" | awk -F "\t" '{print $2-$1}' ) |
555 | 543 | samtools faidx ${cluster}.absent.fa "${insertioncontig}" >> ${cluster}.absent.contigs.fa |
556 | 544 | echo "${insertioncontig};${edges};NA" | tr ';' '\t' >> ${cluster}.regions_plus_flank.plotting.bed |
557 | 545 | done |
558 | 546 |
|
559 | 547 | ##add the coordinates etc for the SLRs to the same bed file (doing it in this order so in the plotting the absent regions will be on top) |
560 | 548 | ##then adding the topSLR first so that it will be plotted downstream and show the insertion site for this element |
561 | | -tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -v topSLR="$topSLR" '{if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed |
| 549 | +tail -n+2 ${cluster}.regions_plus_flank.tsv | awk -F "\t" -v topSLR="$topSLR" '{if($4 == topSLR) {print $1"\t"$2"\t"$3"\t"$4}}' >> ${cluster}.regions_plus_flank.plotting.bed |
562 | 550 |
|
563 | 551 | ##here ideally the order of the next SLRs should be based on the largest similarity stepwise |
564 | 552 | ## can use ../2.PAVs_to_SLRs/${prefix}.SLRs.sig.pairwise.tsv |
|
0 commit comments