Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
name: Integration tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main]

env:
VCF_VALIDATOR_VERSION: "0.10.2"
NXF_VER: "23.10.0"

jobs:
integration-tests:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: Install Java and Node
run: sudo apt update && sudo apt install -y default-jdk nodejs npm git curl

- name: Install vcf-validator and vcf-assembly-checker
run: |
curl -LJo /usr/local/bin/vcf_validator \
https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_validator_linux
curl -LJo /usr/local/bin/vcf_assembly_checker \
https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_assembly_checker_linux
chmod 755 /usr/local/bin/vcf_validator /usr/local/bin/vcf_assembly_checker

- name: Install biovalidator
run: |
git clone https://github.com/elixir-europe/biovalidator.git
cd biovalidator
npm install
sudo npm link

- name: Install Nextflow
run: |
curl -L "https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VER}/nextflow-${NXF_VER}-all" | bash
sudo mv nextflow /usr/local/bin/

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
python -m pip install .

- name: Run integration tests
run: |
PYTHONPATH=. pytest tests -m integration
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=linux/amd64 python:3.10
FROM python:3.10

ENV vcf_validator_version=0.10.2
ENV NXF_VER=23.10.0
Expand Down
3 changes: 2 additions & 1 deletion docs/input_file_overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ alias, which is a shortened identifier you must provide for each analysis.

This is where you describe the biological samples used for your analyses. Each row describes one sample and must include
the Analysis Alias to indicate which analysis it belongs to, and "Sample Name in VCF" which is the exact name of the
sample as it appears in the VCF file.
sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies),
then you can omit the "Sample Name in VCF" column.

We accept preregistered samples, which should be provided using BioSamples sample or sampleset accessions. Please
ensure these are publicly accessible, as otherwise EVA will not be able to validate them.
Expand Down
3 changes: 1 addition & 2 deletions eva_sub_cli/etc/eva_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,6 @@
{
"required": [
"analysisAlias",
"sampleInVCF",
"bioSampleObject"
]
}
Expand All @@ -334,7 +333,7 @@
},
"sampleInVCF": {
"type": "string",
"description": "Sample Name used in the VCF file"
"description": "Sample Name used in the VCF file. It is a required field when genotypes are provided."
},
"bioSampleAccession": {
"type": "string",
Expand Down
11 changes: 8 additions & 3 deletions eva_sub_cli/executables/check_metadata_semantics.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import argparse
import json

import yaml

from eva_sub_cli.semantic_metadata import SemanticMetadataChecker


def main():
    """
    Command line entry point running the semantic checks on an EVA metadata JSON file.

    Reads the metadata JSON and the evidence type results (YAML), runs every semantic check and writes the
    outcome to the YAML file given by --output_yaml.
    """
    arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata')
    arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file')
    arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results',
                            help='Results of the evidence check')
    arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
                            help='Path to the location of the results')
    args = arg_parser.parse_args()

    with open(args.metadata_json) as open_json:
        metadata = json.load(open_json)
    with open(args.evidence_type_results) as open_yaml:
        # yaml.safe_load returns None for an empty document: fall back to an empty mapping so that the checker
        # can always perform dictionary lookups on the results.
        evidence_type_results = yaml.safe_load(open_yaml) or {}

    # The checker requires the evidence type results, so it is built only once both inputs are loaded.
    checker = SemanticMetadataChecker(metadata, evidence_type_results)
    checker.check_all()
    checker.write_result_yaml(args.output_yaml)
25 changes: 13 additions & 12 deletions eva_sub_cli/executables/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def compare_names_in_files_and_samples(sample_name_in_analysis, sample_name_per_
more_metadata_submitted_files)


def compare_all_analysis(metadata, files_per_analysis):
def compare_all_analysis(metadata, files_per_analysis, evidence_type_results):
overall_differences = False
results_per_analysis_alias = {}
all_analysis_alias = set(metadata.samples_per_analysis) | set(files_per_analysis)
Expand All @@ -65,7 +65,7 @@ def compare_all_analysis(metadata, files_per_analysis):
for file_path in files_per_analysis.get(analysis_alias, [])
}

if need_to_check_samples(sample_name_per_file):
if need_to_check_samples(evidence_type_results, analysis_alias):
(
has_difference, more_per_submitted_files_metadata,
more_submitted_files_metadata, more_metadata_submitted_files
Expand All @@ -89,13 +89,9 @@ def compare_all_analysis(metadata, files_per_analysis):
return overall_differences, results_per_analysis_alias


def need_to_check_samples(sample_name_per_file):
no_samples_in_vcf = all(len(v) == 0 for v in sample_name_per_file.values())
if no_samples_in_vcf:
evidence_types_for_vcf_files = [detect_vcf_evidence_type(vcf_file) for vcf_file in sample_name_per_file.keys()]
if set(evidence_types_for_vcf_files) == {'allele_frequency'}:
return False

def need_to_check_samples(evidence_type_results, analysis_alias):
    """
    Tell whether the sample names of an analysis have to be compared with the ones found in its VCF files.

    :param evidence_type_results: mapping of analysis alias to a dict holding an 'evidence_type' entry.
                                  None or an empty mapping is accepted and means that nothing is known.
    :param analysis_alias: alias of the analysis being checked
    :return: False only when the analysis is reported as 'allele_frequency', True in every other case
    """
    # An empty results file is loaded as None and an alias can map to a null entry: in both cases nothing says
    # that the analysis is allele frequency only, so the samples still have to be checked.
    analysis_results = (evidence_type_results or {}).get(analysis_alias) or {}
    return analysis_results.get('evidence_type') != 'allele_frequency'


Expand All @@ -107,14 +103,16 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali
}, stream=open_yaml)


def check_sample_name_concordance(metadata_json, vcf_files, output_yaml, evidence_type_result_file):
    """
    Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
    found in the VCF files

    :param metadata_json: path to the EVA metadata JSON file
    :param vcf_files: paths to the VCF files to compare to the metadata
    :param output_yaml: path where the comparison results are written
    :param evidence_type_result_file: path to the YAML file holding the evidence type results per analysis alias
    """
    metadata = EvaMetadataJson(metadata_json)
    with open(evidence_type_result_file) as open_yaml:
        # yaml.safe_load returns None for an empty document: use an empty mapping so that the per analysis
        # lookups done during the comparison never operate on None.
        evidence_type_results = yaml.safe_load(open_yaml) or {}
    file_path_per_analysis = associate_vcf_path_with_analysis(metadata, vcf_files)
    overall_differences, results_per_analysis_alias = compare_all_analysis(
        metadata, file_path_per_analysis, evidence_type_results
    )
    write_result_yaml(output_yaml, overall_differences, results_per_analysis_alias)


Expand All @@ -127,7 +125,10 @@ def main():
help='Path to the vcf files to compare to the metadata')
arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
help='Path to the location of the results')
arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results',
help='Results of the evidence check')


args = arg_parser.parse_args()
logging_config.add_stdout_handler()
check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml)
check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml, args.evidence_type_results)
26 changes: 22 additions & 4 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,31 @@ workflow {
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())

// Task-specific processing
evidence_type_results = null

if (params.tasks.contains(VCF_CHECK)) {
check_vcf_valid(vcf_and_ref_ch)
evidence_type_check(metadata_json, vcf_files.collect())
evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
}
if (params.tasks.contains(ASSEMBLY_CHECK)) {
check_vcf_reference(vcf_and_ref_ch)
insdc_checker(metadata_json, fasta_to_vcfs)
}
if (params.tasks.contains(METADATA_CHECK)) {
if (!evidence_type_results) {
evidence_type_check(metadata_json, vcf_files.collect())
evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
}
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
metadata_semantic_check(metadata_json, evidence_type_results)
}
if (params.tasks.contains(SAMPLE_CHECK)) {
sample_name_concordance(metadata_json, vcf_files.collect())
if (!evidence_type_results){
evidence_type_check(metadata_json, vcf_files.collect())
evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
}
sample_name_concordance(metadata_json, vcf_files.collect(), evidence_type_results)
}
}

Expand Down Expand Up @@ -294,14 +305,17 @@ process sample_name_concordance {
input:
path(metadata_json)
path(vcf_files)
path(evidence_type_results)

output:
path "sample_checker.yml", emit: sample_checker_yml
path "sample_checker.log", emit: sample_checker_log

script:
"""
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1
$params.python_scripts.samples_checker --metadata_json $metadata_json \
--vcf_files $vcf_files --output_yaml sample_checker.yml \
--evidence_type_results $evidence_type_results > sample_checker.log 2>&1
"""
}

Expand Down Expand Up @@ -362,13 +376,17 @@ process metadata_semantic_check {

input:
path(metadata_json)
path(evidence_type_results)

output:
path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
path "semantic_checker.log", emit: semantic_checker_log

script:
"""
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
$params.python_scripts.semantic_checker \
--metadata_json $metadata_json \
--evidence_type_results $evidence_type_results \
--output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
"""
}
16 changes: 15 additions & 1 deletion eva_sub_cli/semantic_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
PARENT_PROJECT_KEY = 'parentProject'
CHILD_PROJECTS_KEY = 'childProjects'
PEER_PROJECTS_KEY = 'peerProjects'
SAMPLE_IN_VCF_KEY = 'sampleInVCF'
BIOSAMPLE_OBJECT_KEY = 'bioSampleObject'
BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
CHARACTERISTICS_KEY = 'characteristics'
Expand All @@ -40,9 +41,10 @@ def cast_list(l, type_to_cast=str):

class SemanticMetadataChecker(AppLogger):

def __init__(self, metadata, sample_checklist='ERC000011'):
def __init__(self, metadata, evidence_type_results, sample_checklist='ERC000011'):
self.sample_checklist = sample_checklist
self.metadata = metadata
self.evidence_type_results = evidence_type_results
self.errors = []
# Caches whether taxonomy code is valid or not, and maps to scientific name if valid
self.taxonomy_valid = {}
Expand All @@ -60,6 +62,7 @@ def check_all(self):
self.check_all_analysis_run_accessions()
self.check_analysis_alias_coherence()
self.check_all_analysis_contain_samples()
self.check_all_samples_have_sample_in_vcf()
self.check_hold_date()

def check_hold_date(self):
Expand Down Expand Up @@ -316,3 +319,14 @@ def check_all_analysis_contain_samples(self):
json_path = f'/{ANALYSIS_KEY}/{idx}'
self.add_error(property=json_path,
description=f'No sample found for the analysis. Should have at the least one sample.')

def check_all_samples_have_sample_in_vcf(self):
    """
    Add an error for every sample that lacks a sampleInVCF value while it is required.

    sampleInVCF is required as soon as at least one of the analyses the sample refers to is not reported as
    'allele_frequency' in the evidence type results. An analysis without any evidence type result is treated
    as requiring it. Samples that do not refer to any analysis are not checked here.
    """
    # The results can be None when they come from an empty YAML document.
    evidence_type_results = self.evidence_type_results or {}
    for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
        analysis_aliases = sample.get(ANALYSIS_ALIAS_KEY, [])
        # Generator expression so that any() stops at the first analysis that needs sample names.
        sample_in_vcf_required = any(
            (evidence_type_results.get(analysis_alias) or {}).get('evidence_type') != 'allele_frequency'
            for analysis_alias in analysis_aliases
        )
        # A missing key and an empty string both count as "not provided".
        if sample_in_vcf_required and sample.get(SAMPLE_IN_VCF_KEY) in (None, ''):
            self.add_error(
                property=f'/{SAMPLE_KEY}/{idx}/{SAMPLE_IN_VCF_KEY}',
                description=f'{SAMPLE_IN_VCF_KEY} must be provided when Genotypes are present in the VCF file'
            )

6 changes: 6 additions & 0 deletions tests/resources/sample_checker/evidence_type.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
VD1:
evidence_type: genotype
VD2:
evidence_type: genotype
VD3:
evidence_type: genotype
9 changes: 9 additions & 0 deletions tests/resources/sample_in_vcf_check/allele_freq.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1,length=249250621>
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total Allele Count">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele Count">
#CHROM POS ID REF ALT QUAL FILTER INFO
1 10177 rs367896724 A AC 100 PASS AF=0.11;AN=2000;AC=220
1 10505 rs548419688 A T 100 PASS AF=0.09;AN=2000;AC=180
2 changes: 2 additions & 0 deletions tests/resources/sample_in_vcf_check/fake_fasta.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>fasta
AAA
7 changes: 7 additions & 0 deletions tests/resources/sample_in_vcf_check/genotype.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1,assembly=b37,length=249250621>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3
1 10177 rs367896724 A AC 100 PASS . GT 1|0 0|1 0|0
1 10505 rs548419688 A T 100 PASS . GT 0|0 0|0 0|1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]}
Loading
Loading