From 5ec7ec2d66a5172f2fd875f18df0d5dac150f384 Mon Sep 17 00:00:00 2001
From: Joao Machado <11202921@isep.ipp.pt>
Date: Fri, 24 Mar 2023 14:45:52 +0000
Subject: [PATCH] change running scripts structure

---
 run_toil_wf.sh    |  48 ++++++++++
 run_wf.sh         | 219 +---------------------------------------------
 util_functions.sh | 189 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 215 deletions(-)
 create mode 100755 run_toil_wf.sh
 create mode 100644 util_functions.sh

diff --git a/run_toil_wf.sh b/run_toil_wf.sh
new file mode 100755
index 00000000..225b4f2d
--- /dev/null
+++ b/run_toil_wf.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+source util_functions.sh
+
+
+# ----------------------------- running pipeline ----------------------------- #
+
+# IMPORTANT!
+# To work with Slurm, add "--batchSystem slurm", "--disableChaining" and "--disableCaching" to the TOIL_PARAMS array
+TOIL_PARAMS+=(
+    --singularity
+    --preserve-entire-environment
+    --batchSystem slurm
+    --disableChaining
+    # --provenance "${PROV_DIR}"
+    --disableCaching
+    --logFile "${LOG_DIR}/${NAME}.log"
+    --jobStore "${JOB_TOIL_FOLDER}/${NAME}"
+    --outdir "${OUT_DIR_FINAL}"
+    --maxCores 20
+    --defaultMemory "${MEMORY}"
+    --defaultCores "${NUM_CORES}"
+    --retryCount 2
+    --logDebug
+    "$CWL"
+    "$RENAMED_YML"
+)
+
+
+# Toil parameters documentation
+# --disableChaining              Disables chaining of jobs (chaining uses one job's resource allocation for its successor job if possible).
+# --preserve-entire-environment  Propagates the environment variables (needed for Singularity, etc.) into the HPC jobs.
+# --disableProgress              Disables the progress bar shown when standard error is a terminal.
+# --retryCount                   Number of times to retry a failing job before giving up and labeling the job as failed. default=1
+# --disableCaching               Disables caching in the file store. This flag must be set to use a batch system that does not support caching, such as Grid Engine, Parasol, LSF, or Slurm.
+
+
+# Create util directories
+mkdir -p ${LOG_DIR}
+mkdir -p ${JOB_TOIL_FOLDER}
+mkdir -p ${RUN_DIR}/tmp_dir/
+mkdir -p ${RUN_DIR}/tmp_out_dir/
+
+
+toil-cwl-runner "${TOIL_PARAMS[@]}"
+
+
+
diff --git a/run_wf.sh b/run_wf.sh
index abf35ed8..34c2a8cd 100755
--- a/run_wf.sh
+++ b/run_wf.sh
@@ -1,226 +1,15 @@
 #!/bin/bash
 
-# default values #
-SCRIPT_PATH=$(realpath "$0")
-PIPELINE_DIR=$(dirname "${SCRIPT_PATH}")
-MEMORY=10G
-NUM_CORES=1
-LIMIT_QUEUE=100
-YML="${PIPELINE_DIR}/Installation/templates/default.yml"
-DB_DIR="${PIPELINE_DIR}/ref-dbs/"
-TOOLS="${PIPELINE_DIR}/tools/"
-
-_usage() {
-    echo "
-metaGOflow interface.
-Script arguments.
-  Resources:
-    -m      Memory to use to with toil --defaultMemory. (optional, default ${MEMORY})
-    -c      Number of cpus to use with toil --defaultCores. (optional, default ${NUM_CORES})
-    -l      Limit number of jobs to schedule. (optional, default ${LIMIT_QUEUE})
-
-  Pipeline parameters:
-    -y      template yml file. (optional, default ../templates/rna_prediction_template.yml})
-    -f      Forward reads fasta file path.
-    -r      Reverse reads fasta file path.
-    -n      Name of run and prefix to output files.
-    -d      Path to run directory.
-    -s      Run workflow using Singularity (docker is the by default container technology) ('true' or 'false')
-"
-}
-
-while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
-    case "${option}" in
-    y) YML=${OPTARG} ;;
-    f)
-        FORWARD_READS=${OPTARG}
-        echo "Presented paired-end forward read: ${FORWARD_READS}"
-        ;;
-    r)
-        REVERSE_READS=${OPTARG}
-        printf "Presented paired-end reverse path: ${REVERSE_READS}\n"
-        ;;
-    e) ENA_RUN_ID=${OPTARG} ;;
-    u) ENA_USERNAME=${OPTARG} ;;
-    k) ENA_PASSWORD=${OPTARG} ;;
-    c) NUM_CORES=${OPTARG} ;;
-    d) RUN_DIR=${OPTARG} ;;
-    m) MEMORY=${OPTARG} ;;
-    n) NAME=${OPTARG} ;;
-    l) LIMIT_QUEUE=${OPTARG} ;;
-    s) SINGULARITY="--singularity" ;;
-    p) PRIVATE_DATA="-p" ;;
-    h)
-        _usage
-        exit 0
-        ;;
-    :)
-        usage
-        exit 1
-        ;;
-    \?)
-        echo ""
-        echo "Invalid option -${OPTARG}" >&2
-        _usage
-        exit 1
-        ;;
-    esac
-done
-
-# ----------------------------- sanity check arguments ----------------------------- #
-
-_check_mandatory() {
-    # Check if the argument is empty or null
-    # $1 variable
-    # $2 name to show
-    if [ -z "$1" ]; then
-        echo "Error." >&2
-        echo "Option ${2} is mandatory " >&2
-        echo "type -h to get help"
-        exit 1
-    fi
-}
-
-_check_reads() {
-    #check forward and reverse reads both present
-    #check if single reads then no other readsgiven
-
-    # BASH SYNTAX:
-    # to check if a variable has value:
-    # [ -z "$var" ] && echo "Empty"
-
-    if [ -z "$1" ] && [ -n "$2" ]; then
-        echo "Error"
-        echo "only reverse reads given, provide forward with -f"
-        exit 1
-    fi
-
-    if [ -n "$1" ] && [ -z "$2" ]; then
-        echo "Error"
-        echo "only forward reads given, provide reverse with -r"
-        exit 1
-    fi
-
-}
-
-_check_mandatory "$NAME" "-n"
-_check_mandatory "$RUN_DIR" "-d"
-_check_reads "$FORWARD_READS" "$REVERSE_READS"
-
-
-
-# ----------------------------- environment & variables ----------------------------- #
-
-# load required environments and packages before running
-
-export TOIL_SLURM_ARGS="--array=1-${LIMIT_QUEUE}%20" #schedule 100 jobs 20 running at one time
-export CWL="${PIPELINE_DIR}/workflows/gos_wf.cwl"
-
-# work dir
-export WORK_DIR=${RUN_DIR}/work-dir
-export JOB_TOIL_FOLDER=${WORK_DIR}/job-store-wf
-export TMPDIR=${RUN_DIR}/tmp
-
-# result dir
-export OUT_DIR=${RUN_DIR}
-export LOG_DIR=${OUT_DIR}/log-dir/${NAME}
-export OUT_DIR_FINAL=${OUT_DIR}/results
-export PROV_DIR=${OUT_DIR}/prov
-export CACHE_DIR=${OUT_DIR}/cache
-mkdir -p "${OUT_DIR_FINAL}" "${TMPDIR}" "${PROV_DIR}"
-    #"${CACHE_DIR}" ${JOB_TOIL_FOLDER}" "${LOG_DIR}"
-
-export RENAMED_YML_TMP=${RUN_DIR}/"${NAME}"_temp.yml
-export RENAMED_YML=${RUN_DIR}/"${NAME}".yml
-
-# Get study id in case of ENA fetch tool
-if [[ $ENA_RUN_ID != "" ]];
-then
-
-    # Run cwl for the ENA fetch tool
-    cp tools/fetch-tool/get_raw_data_run.cwl .
-
-    printf "
-    run_accession_number: ${ENA_RUN_ID}
-    private_data: true
-    ena_api_username: ${ENA_USERNAME}
-    ena_api_password: ${ENA_PASSWORD}
-    " > get_raw_data_run-test.yml
-
-    cwl-runner ${SINGULARITY} --outdir ${OUT_DIR} --debug get_raw_data_run.cwl get_raw_data_run-test.yml
-
-    rm get_raw_data_run.cwl
-    rm get_raw_data_run-test.yml
-
-    # Get the accession id of the corresponding study
-    ENA_STUDY_ID=$(curl -X POST "https://www.ebi.ac.uk/ena/browser/api/xml?accessions="$ENA_RUN_ID"&expanded=true" \
-        -H "accept: application/xml" | grep -A 1 "ENA-STUDY" | tail -1 | sed 's/.*<ID>// ; s/<\/ID>//')
-
-    export PATH_ENA_RAW_DATA=${PIPELINE_DIR}/${OUT_DIR}/raw_data_from_ENA/${ENA_STUDY_ID}/raw/
-
-
-fi
-
-
-# ----------------------------- prepare yml file ----------------------------- #
-
-echo "Writing yaml file"
-
-# DO NOT leave spaces after "\" in the end of a line
-python3 create_yml.py \
-    -y "${YML}" \
-    -o "${RENAMED_YML_TMP}" \
-    -l "${PATH_ENA_RAW_DATA}" \
-    -f "${PIPELINE_DIR}/${FORWARD_READS}" \
-    -r "${PIPELINE_DIR}/${REVERSE_READS}" \
-    -d "${DB_DIR}" \
-    -t "${TOOLS}" \
-    -e "${ENA_RUN_ID}"
-
-mv eosc-wf.yml ${RUN_DIR}/
-cat ${RUN_DIR}/eosc-wf.yml ${RENAMED_YML_TMP} > ${RENAMED_YML}
-rm ${RENAMED_YML_TMP}
-rm ${RUN_DIR}/eosc-wf.yml
+source util_functions.sh
 
 
 # ----------------------------- running pipeline ----------------------------- #
 
-# IMPORTANT!
-# To work with slurm, add "--batchSystem slurm", "--disableChaining" and "--disableCaching" in the TOIL_PARMS object
-TOIL_PARAMS+=(
-    --singularity
-    --preserve-entire-environment
-    --batchSystem slurm
-    --disableChaining
-    # --provenance "${PROV_DIR}"
-    --disableCaching
-    --logFile "${LOG_DIR}/${NAME}.log"
-    --jobStore "${JOB_TOIL_FOLDER}/${NAME}"
-    --outdir "${OUT_DIR_FINAL}"
-    --maxCores 20
-    --defaultMemory "${MEMORY}"
-    --defaultCores "${NUM_CORES}"
-    --retryCount 2
-    --logDebug
-    "$CWL"
-    "$RENAMED_YML"
-)
-
-# Toil parameters documentation
-# --disableChaining    Disables chaining of jobs (chaining uses one job's resource allocation for its successor job if possible).
-# --preserve-entire-environment    Need to propagate the env vars for Singularity, etc., into the HPC jobs
-# --disableProgress    Disables the progress bar shown when standard error is a terminal.
-# --retryCount    Number of times to retry a failing job before giving up and labeling job failed. default=1
-# --disableCaching    Disables caching in the file store. This flag must be set to use a batch system that does not support caching such as Grid Engine, Parasol, LSF, or Slurm.
+cwltool ${SINGULARITY} --provenance ${PROV_DIR} --outdir ${OUT_DIR_FINAL} ${CWL} ${RENAMED_YML}
 
-
-# # COMMENT IN TO RUN THE TOIL VERSION
-# echo "toil-cwl-runner" "${TOIL_PARAMS[@]}"
-# toil-cwl-runner "${TOIL_PARAMS[@]}"
-
-
-cwltool --debug ${SINGULARITY} --provenance ${PROV_DIR} --outdir ${OUT_DIR_FINAL} ${CWL} ${RENAMED_YML}
+#cwltool --debug ${SINGULARITY} --provenance ${PROV_DIR} --outdir ${OUT_DIR_FINAL} ${CWL} ${RENAMED_YML}
 # --cachedir ${CACHE_DIR}
 # --leave-tmpdir --leave-outputs
+
diff --git a/util_functions.sh b/util_functions.sh
new file mode 100644
index 00000000..f840547b
--- /dev/null
+++ b/util_functions.sh
@@ -0,0 +1,189 @@
+# default values #
+SCRIPT_PATH=$(realpath "$0")
+PIPELINE_DIR=$(dirname "${SCRIPT_PATH}")
+MEMORY=10G
+NUM_CORES=1
+LIMIT_QUEUE=100
+YML="${PIPELINE_DIR}/Installation/templates/default.yml"
+DB_DIR="${PIPELINE_DIR}/ref-dbs/"
+TOOLS="${PIPELINE_DIR}/tools/"
+
+_usage() {
+    echo "
+metaGOflow interface.
+Script arguments.
+  Resources:
+    -m      Memory to use with toil --defaultMemory. (optional, default ${MEMORY})
+    -c      Number of cpus to use with toil --defaultCores. (optional, default ${NUM_CORES})
+    -l      Limit number of jobs to schedule. (optional, default ${LIMIT_QUEUE})
+
+  Pipeline parameters:
+    -y      Template yml file. (optional, default ${YML})
+    -f      Forward reads fasta file path.
+    -r      Reverse reads fasta file path.
+    -n      Name of run and prefix to output files.
+    -d      Path to run directory.
+    -s      Run workflow using Singularity (Docker is the default container technology)
+"
+}
+
+while getopts :y:f:r:e:u:k:c:d:m:n:l:sph option; do
+    case "${option}" in
+    y) YML=${OPTARG} ;;
+    f)
+        FORWARD_READS=${OPTARG}
+        echo "Presented paired-end forward read: ${FORWARD_READS}"
+        ;;
+    r)
+        REVERSE_READS=${OPTARG}
+        printf "Presented paired-end reverse read: ${REVERSE_READS}\n"
+        ;;
+    e) ENA_RUN_ID=${OPTARG} ;;
+    u) ENA_USERNAME=${OPTARG} ;;
+    k) ENA_PASSWORD=${OPTARG} ;;
+    c) NUM_CORES=${OPTARG} ;;
+    d) RUN_DIR=${OPTARG} ;;
+    m) MEMORY=${OPTARG} ;;
+    n) NAME=${OPTARG} ;;
+    l) LIMIT_QUEUE=${OPTARG} ;;
+    s) SINGULARITY="--singularity" ;;
+    p) PRIVATE_DATA="-p" ;;
+    h)
+        _usage
+        exit 0
+        ;;
+    :)
+        _usage
+        exit 1
+        ;;
+    \?)
+        echo ""
+        echo "Invalid option -${OPTARG}" >&2
+        _usage
+        exit 1
+        ;;
+    esac
+done
+
+# ----------------------------- sanity check arguments ----------------------------- #
+
+_check_mandatory() {
+    # Check if the argument is empty or null
+    # $1 variable
+    # $2 name to show
+    if [ -z "$1" ]; then
+        echo "Error." >&2
+        echo "Option ${2} is mandatory" >&2
+        echo "type -h to get help"
+        exit 1
+    fi
+}
+
+_check_reads() {
+    # check that forward and reverse reads are either both present or both absent
+    # if only one of the two is given, exit with an error
+
+    # BASH SYNTAX:
+    # to check if a variable is empty:
+    # [ -z "$var" ] && echo "Empty"
+
+    if [ -z "$1" ] && [ -n "$2" ]; then
+        echo "Error"
+        echo "only reverse reads given, provide forward with -f"
+        exit 1
+    fi
+
+    if [ -n "$1" ] && [ -z "$2" ]; then
+        echo "Error"
+        echo "only forward reads given, provide reverse with -r"
+        exit 1
+    fi
+
+}
+
+_check_mandatory "$NAME" "-n"
+_check_mandatory "$RUN_DIR" "-d"
+_check_reads "$FORWARD_READS" "$REVERSE_READS"
+
+
+
+# ----------------------------- environment & variables ----------------------------- #
+
+# load required environments and packages before running
+
+export TOIL_SLURM_ARGS="--array=1-${LIMIT_QUEUE}%20" # schedule LIMIT_QUEUE jobs, 20 running at a time
+export CWL="${PIPELINE_DIR}/workflows/gos_wf.cwl"
+
+# work dir
+export WORK_DIR=${RUN_DIR}/work-dir
+export JOB_TOIL_FOLDER=${WORK_DIR}/job-store-wf
+export TMPDIR=${RUN_DIR}/tmp
+
+# result dir
+export OUT_DIR=${RUN_DIR}
+export LOG_DIR=${OUT_DIR}/log-dir/${NAME}
+export OUT_DIR_FINAL=${OUT_DIR}/results
+export PROV_DIR=${OUT_DIR}/prov
+export CACHE_DIR=${OUT_DIR}/cache
+mkdir -p "${OUT_DIR_FINAL}" "${TMPDIR}" "${PROV_DIR}"
+    #"${CACHE_DIR}" ${JOB_TOIL_FOLDER}" "${LOG_DIR}"
+
+export RENAMED_YML_TMP=${RUN_DIR}/"${NAME}"_temp.yml
+export RENAMED_YML=${RUN_DIR}/"${NAME}".yml
+
+# Get study id in case of ENA fetch tool
+if [[ $ENA_RUN_ID != "" ]];
+then
+
+    # Run cwl for the ENA fetch tool
+    cp tools/fetch-tool/get_raw_data_run.cwl .
+
+    printf "
+    run_accession_number: ${ENA_RUN_ID}
+    private_data: true
+    ena_api_username: ${ENA_USERNAME}
+    ena_api_password: ${ENA_PASSWORD}
+    " > get_raw_data_run-test.yml
+
+    cwl-runner ${SINGULARITY} --outdir ${OUT_DIR} --debug get_raw_data_run.cwl get_raw_data_run-test.yml
+
+    rm get_raw_data_run.cwl
+    rm get_raw_data_run-test.yml
+
+    # Get the accession id of the corresponding study
+    ENA_STUDY_ID=$(curl -X POST "https://www.ebi.ac.uk/ena/browser/api/xml?accessions="$ENA_RUN_ID"&expanded=true" \
+        -H "accept: application/xml" | grep -A 1 "ENA-STUDY" | tail -1 | sed 's/.*<ID>// ; s/<\/ID>//')
+
+    export PATH_ENA_RAW_DATA=${PIPELINE_DIR}/${OUT_DIR}/raw_data_from_ENA/${ENA_STUDY_ID}/raw/
+
+
+fi
+
+
+# ----------------------------- prepare yml file ----------------------------- #
+
+echo "Writing yaml file"
+
+echo "Forward reads full path: ${PIPELINE_DIR}/${FORWARD_READS}"
+
+echo "Forward reads as given:"
+echo "${FORWARD_READS}"
+
+
+# DO NOT leave spaces after the "\" at the end of a line
+python3 create_yml.py \
+    -y "${YML}" \
+    -o "${RENAMED_YML_TMP}" \
+    -l "${PATH_ENA_RAW_DATA}" \
+    -f "${PIPELINE_DIR}/${FORWARD_READS}" \
+    -r "${PIPELINE_DIR}/${REVERSE_READS}" \
+    -d "${DB_DIR}" \
+    -t "${TOOLS}" \
+    -e "${ENA_RUN_ID}"
+
+echo "Done writing yaml file"
+
+mv eosc-wf.yml ${RUN_DIR}/
+cat ${RUN_DIR}/eosc-wf.yml ${RENAMED_YML_TMP} > ${RENAMED_YML}
+rm ${RENAMED_YML_TMP}
+rm ${RUN_DIR}/eosc-wf.yml
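
Usage note (not part of the patch): after this change both entry points source util_functions.sh for option parsing, sanity checks and YAML preparation, so they accept the same flags. The sketch below is illustrative only; the run name, run directory and read file names are placeholders, and the Toil variant assumes a Slurm cluster with Toil and Singularity available, as the TOIL_PARAMS above imply.

    # cwltool-based run (run_wf.sh); -n and -d are mandatory, -s makes cwltool use Singularity
    ./run_wf.sh -n test_run -d /path/to/run_dir \
        -f reads/sample_R1.fastq.gz -r reads/sample_R2.fastq.gz -s

    # Toil/Slurm run (run_toil_wf.sh); same interface, resources set via -c (cores) and -m (memory)
    ./run_toil_wf.sh -n test_run -d /path/to/run_dir \
        -f reads/sample_R1.fastq.gz -r reads/sample_R2.fastq.gz -c 8 -m 20G

Instead of -f/-r, an ENA run accession can be fetched with -e (plus -u/-k for private data), as handled in util_functions.sh.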