diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 00000000..11073128 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,56 @@ +name: Integration tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main] + +env: + VCF_VALIDATOR_VERSION: "0.10.2" + NXF_VER: "23.10.0" + +jobs: + integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Java and Node + run: sudo apt update && sudo apt install -y default-jdk nodejs npm git curl + + - name: Install vcf-validator and vcf-assembly-checker + run: | + curl -LJo /usr/local/bin/vcf_validator \ + https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_validator_linux + curl -LJo /usr/local/bin/vcf_assembly_checker \ + https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_assembly_checker_linux + chmod 755 /usr/local/bin/vcf_validator /usr/local/bin/vcf_assembly_checker + + - name: Install biovalidator + run: | + git clone https://github.com/elixir-europe/biovalidator.git + cd biovalidator + npm install + sudo npm link + + - name: Install Nextflow + run: | + curl -L "https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VER}/nextflow-${NXF_VER}-all" | bash + sudo mv nextflow /usr/local/bin/ + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + python -m pip install . + + - name: Run integration tests + run: | + PYTHONPATH=. pytest tests -m integration diff --git a/docker/Dockerfile b/docker/Dockerfile index 1233e0c5..b8c054d3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.10 +FROM python:3.10 ENV vcf_validator_version=0.10.2 ENV NXF_VER=23.10.0 diff --git a/docs/input_file_overview.md b/docs/input_file_overview.md index a52399ab..c2f302e2 100644 --- a/docs/input_file_overview.md +++ b/docs/input_file_overview.md @@ -98,7 +98,8 @@ alias, which is a shortened identifier you must provide for each analysis. This is where you describe the biological samples used for your analyses. Each row describes one sample and must include the Analysis Alias to indicate which analysis it belongs to, and "Sample Name in VCF" which is the exact name of the -sample as it appears in the VCF file. +sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies), +then you can omit the "Sample Name in VCF" column. We accept preregistered samples, which should be provided using BioSamples sample or sampleset accessions. Please ensure these are publicly accessible, as otherwise EVA will not be able to validate them. diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index 83559c6c..4e28906f 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -318,7 +318,6 @@ { "required": [ "analysisAlias", - "sampleInVCF", "bioSampleObject" ] } @@ -334,7 +333,7 @@ }, "sampleInVCF": { "type": "string", - "description": "Sample Name used in the VCF file" + "description": "Sample Name used in the VCF file. It is a required field when genotypes are provided." }, "bioSampleAccession": { "type": "string", diff --git a/eva_sub_cli/executables/check_metadata_semantics.py b/eva_sub_cli/executables/check_metadata_semantics.py index 68bf75e8..5ed9dbc4 100644 --- a/eva_sub_cli/executables/check_metadata_semantics.py +++ b/eva_sub_cli/executables/check_metadata_semantics.py @@ -1,18 +1,23 @@ import argparse import json +import yaml + from eva_sub_cli.semantic_metadata import SemanticMetadataChecker def main(): arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata') arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file') + arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results', help='Results of the evidence check') arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', help='Path to the location of the results') args = arg_parser.parse_args() with open(args.metadata_json) as open_json: metadata = json.load(open_json) - checker = SemanticMetadataChecker(metadata) - checker.check_all() - checker.write_result_yaml(args.output_yaml) + with open(args.evidence_type_results) as open_yaml: + evidence_type_results = yaml.safe_load(open_yaml) + checker = SemanticMetadataChecker(metadata, evidence_type_results) + checker.check_all() + checker.write_result_yaml(args.output_yaml) diff --git a/eva_sub_cli/executables/samples_checker.py b/eva_sub_cli/executables/samples_checker.py index 1627fdf7..c9823caf 100644 --- a/eva_sub_cli/executables/samples_checker.py +++ b/eva_sub_cli/executables/samples_checker.py @@ -54,7 +54,7 @@ def compare_names_in_files_and_samples(sample_name_in_analysis, sample_name_per_ more_metadata_submitted_files) -def compare_all_analysis(metadata, files_per_analysis): +def compare_all_analysis(metadata, files_per_analysis, evidence_type_results): overall_differences = False results_per_analysis_alias = {} all_analysis_alias = set(metadata.samples_per_analysis) | set(files_per_analysis) @@ -65,7 +65,7 @@ def compare_all_analysis(metadata, files_per_analysis): for file_path in files_per_analysis.get(analysis_alias, []) } - if need_to_check_samples(sample_name_per_file): + if need_to_check_samples(evidence_type_results, analysis_alias): ( has_difference, more_per_submitted_files_metadata, more_submitted_files_metadata, more_metadata_submitted_files @@ -89,13 +89,9 @@ def compare_all_analysis(metadata, files_per_analysis): return overall_differences, results_per_analysis_alias -def need_to_check_samples(sample_name_per_file): - no_samples_in_vcf = all(len(v) == 0 for v in sample_name_per_file.values()) - if no_samples_in_vcf: - evidence_types_for_vcf_files = [detect_vcf_evidence_type(vcf_file) for vcf_file in sample_name_per_file.keys()] - if set(evidence_types_for_vcf_files) == {'allele_frequency'}: - return False - +def need_to_check_samples(evidence_type_results, analysis_alias): + if evidence_type_results.get(analysis_alias, {}).get('evidence_type') == 'allele_frequency': + return False return True @@ -107,14 +103,16 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali }, stream=open_yaml) -def check_sample_name_concordance(metadata_json, vcf_files, output_yaml): +def check_sample_name_concordance(metadata_json, vcf_files, output_yaml, evidence_type_result_file): """ Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones found in the VCF files """ metadata = EvaMetadataJson(metadata_json) + with open(evidence_type_result_file) as open_yaml: + evidence_type_results = yaml.safe_load(open_yaml) file_path_per_analysis = associate_vcf_path_with_analysis(metadata, vcf_files) - overall_differences, results_per_analysis_alias = compare_all_analysis(metadata, file_path_per_analysis) + overall_differences, results_per_analysis_alias = compare_all_analysis(metadata, file_path_per_analysis, evidence_type_results) write_result_yaml(output_yaml, overall_differences, results_per_analysis_alias) @@ -127,7 +125,10 @@ def main(): help='Path to the vcf files to compare to the metadata') arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', help='Path to the location of the results') + arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results', + help='Results of the evidence check') + args = arg_parser.parse_args() logging_config.add_stdout_handler() - check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml) + check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml, args.evidence_type_results) diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 51cec0cb..4d287401 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -105,20 +105,31 @@ workflow { collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect()) // Task-specific processing + evidence_type_results = null + if (params.tasks.contains(VCF_CHECK)) { check_vcf_valid(vcf_and_ref_ch) evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml } if (params.tasks.contains(ASSEMBLY_CHECK)) { check_vcf_reference(vcf_and_ref_ch) insdc_checker(metadata_json, fasta_to_vcfs) } if (params.tasks.contains(METADATA_CHECK)) { + if (!evidence_type_results) { + evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml + } metadata_json_validation(metadata_json) - metadata_semantic_check(metadata_json) + metadata_semantic_check(metadata_json, evidence_type_results) } if (params.tasks.contains(SAMPLE_CHECK)) { - sample_name_concordance(metadata_json, vcf_files.collect()) + if (!evidence_type_results){ + evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml + } + sample_name_concordance(metadata_json, vcf_files.collect(), evidence_type_results) } } @@ -294,6 +305,7 @@ process sample_name_concordance { input: path(metadata_json) path(vcf_files) + path(evidence_type_results) output: path "sample_checker.yml", emit: sample_checker_yml @@ -301,7 +313,9 @@ process sample_name_concordance { script: """ - $params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1 + $params.python_scripts.samples_checker --metadata_json $metadata_json \ + --vcf_files $vcf_files --output_yaml sample_checker.yml \ + --evidence_type_results $evidence_type_results > sample_checker.log 2>&1 """ } @@ -362,6 +376,7 @@ process metadata_semantic_check { input: path(metadata_json) + path(evidence_type_results) output: path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml @@ -369,6 +384,9 @@ process metadata_semantic_check { script: """ - $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1 + $params.python_scripts.semantic_checker \ + --metadata_json $metadata_json \ + --evidence_type_results $evidence_type_results \ + --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1 """ } diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index 5ce27cad..b4fa1112 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -20,6 +20,7 @@ PARENT_PROJECT_KEY = 'parentProject' CHILD_PROJECTS_KEY = 'childProjects' PEER_PROJECTS_KEY = 'peerProjects' +SAMPLE_IN_VCF_KEY = 'sampleInVCF' BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession' CHARACTERISTICS_KEY = 'characteristics' @@ -40,9 +41,10 @@ def cast_list(l, type_to_cast=str): class SemanticMetadataChecker(AppLogger): - def __init__(self, metadata, sample_checklist='ERC000011'): + def __init__(self, metadata, evidence_type_results, sample_checklist='ERC000011'): self.sample_checklist = sample_checklist self.metadata = metadata + self.evidence_type_results = evidence_type_results self.errors = [] # Caches whether taxonomy code is valid or not, and maps to scientific name if valid self.taxonomy_valid = {} @@ -60,6 +62,7 @@ def check_all(self): self.check_all_analysis_run_accessions() self.check_analysis_alias_coherence() self.check_all_analysis_contain_samples() + self.check_all_samples_have_sample_in_vcf() self.check_hold_date() def check_hold_date(self): @@ -316,3 +319,14 @@ def check_all_analysis_contain_samples(self): json_path = f'/{ANALYSIS_KEY}/{idx}' self.add_error(property=json_path, description=f'No sample found for the analysis. Should have at the least one sample.') + + def check_all_samples_have_sample_in_vcf(self): + for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): + json_path = f'/{SAMPLE_KEY}/{idx}/{SAMPLE_IN_VCF_KEY}' + analysis_aliases = sample.get(ANALYSIS_ALIAS_KEY, []) + if any([self.evidence_type_results.get(analysis_alias, {}).get('evidence_type') != 'allele_frequency' for + analysis_alias in analysis_aliases]): + # SampleInVCF is required + if sample.get(SAMPLE_IN_VCF_KEY) is None or sample.get(SAMPLE_IN_VCF_KEY) == '': + self.add_error(json_path, f'{SAMPLE_IN_VCF_KEY} must be provided when Genotypes are present in the VCF file') + diff --git a/tests/resources/sample_checker/evidence_type.yaml b/tests/resources/sample_checker/evidence_type.yaml new file mode 100644 index 00000000..154be1da --- /dev/null +++ b/tests/resources/sample_checker/evidence_type.yaml @@ -0,0 +1,6 @@ +VD1: + evidence_type: genotype +VD2: + evidence_type: genotype +VD3: + evidence_type: genotype diff --git a/tests/resources/sample_in_vcf_check/allele_freq.vcf b/tests/resources/sample_in_vcf_check/allele_freq.vcf new file mode 100644 index 00000000..2e018a96 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/allele_freq.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10177 rs367896724 A AC 100 PASS AF=0.11;AN=2000;AC=220 +1 10505 rs548419688 A T 100 PASS AF=0.09;AN=2000;AC=180 diff --git a/tests/resources/sample_in_vcf_check/fake_fasta.fa b/tests/resources/sample_in_vcf_check/fake_fasta.fa new file mode 100644 index 00000000..8e8fc17c --- /dev/null +++ b/tests/resources/sample_in_vcf_check/fake_fasta.fa @@ -0,0 +1,2 @@ +>fasta +AAA diff --git a/tests/resources/sample_in_vcf_check/genotype.vcf b/tests/resources/sample_in_vcf_check/genotype.vcf new file mode 100644 index 00000000..7e316729 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/genotype.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3 +1 10177 rs367896724 A AC 100 PASS . GT 1|0 0|1 0|0 +1 10505 rs548419688 A T 100 PASS . GT 0|0 0|0 0|1 \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json new file mode 100644 index 00000000..7d1b74ab --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]} \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json new file mode 100644 index 00000000..330ef9fc --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]} \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json new file mode 100644 index 00000000..78335608 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]} \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json new file mode 100644 index 00000000..d4873d6f --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]} \ No newline at end of file diff --git a/tests/test_native_validator_sample_in_vcf.py b/tests/test_native_validator_sample_in_vcf.py new file mode 100644 index 00000000..307704cb --- /dev/null +++ b/tests/test_native_validator_sample_in_vcf.py @@ -0,0 +1,96 @@ +import os +import shutil +from unittest import TestCase + +import pytest +import yaml + +from eva_sub_cli.validators.native_validator import NativeValidator +from eva_sub_cli.validators.validator import METADATA_CHECK +from tests.test_utils import create_mapping_file + + +@pytest.mark.integration('You need to install java, nextflow, vcf_validator, vcf_assembly_checker, biovalidator (and md5sum, stat for mac)') +class TestNativeValidatorSampleInVCF(TestCase): + resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + sample_in_vcf_dir = os.path.join(resource_dir, 'sample_in_vcf_check') + fasta_file = os.path.join(sample_in_vcf_dir, 'fake_fasta.fa') + + def setUp(self): + self.test_run_dir = os.path.join(self.resource_dir, 'test_native_run') + os.makedirs(self.test_run_dir, exist_ok=True) + self.mapping_file = os.path.join(self.test_run_dir, 'vcf_files_metadata.csv') + + def tearDown(self): + shutil.rmtree(self.test_run_dir) + + def _build_validator(self, metadata_json, vcf_file, tasks): + create_mapping_file( + self.mapping_file, + vcf_files=[vcf_file], + fasta_files=[self.fasta_file], + assembly_reports=None, + ) + return NativeValidator( + mapping_file=self.mapping_file, + submission_dir=self.test_run_dir, + project_title='Test Project', + metadata_json=metadata_json, + validation_tasks=tasks, + ) + + def _get_semantic_errors(self): + semantic_yaml = os.path.join( + self.test_run_dir, 'validation_output', 'other_validations', 'metadata_semantic_check.yml' + ) + with open(semantic_yaml) as f: + return yaml.safe_load(f) or [] + + def _sample_in_vcf_errors(self, errors): + return [e for e in errors if 'sampleInVCF' in e.get('property', '')] + + + def test_af_vcf_without_sample_in_vcf(self): + """AF evidence type: omitting sampleInVCF should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_af_no_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf'), + [METADATA_CHECK] + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) + + def test_af_vcf_with_sample_in_vcf(self): + """AF evidence type: providing sampleInVCF is permitted and should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_af_with_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf'), + [METADATA_CHECK] + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) + + def test_genotype_vcf_without_sample_in_vcf(self): + """Genotype evidence type: omitting sampleInVCF must produce one error per sample.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_no_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + [METADATA_CHECK] + ) + validator.validate() + errors = self._get_semantic_errors() + sample_in_vcf_errors = self._sample_in_vcf_errors(errors) + self.assertEqual(len(sample_in_vcf_errors), 3) + + def test_genotype_vcf_with_sample_in_vcf(self): + """Genotype evidence type: providing sampleInVCF for every sample should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_with_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + [METADATA_CHECK] + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) diff --git a/tests/test_samples_checker.py b/tests/test_samples_checker.py index 5508e602..bef7bd1d 100644 --- a/tests/test_samples_checker.py +++ b/tests/test_samples_checker.py @@ -49,7 +49,8 @@ def test_check_sample_name_concordance_absolute_paths(self): os.remove(updated_metadata) def run_and_assert_sample_check(self, metadata_json, vcf_files): - check_sample_name_concordance(metadata_json, vcf_files, self.output_yaml) + evidence_type_yaml = os.path.join(self.resource_dir, 'sample_checker', 'evidence_type.yaml') + check_sample_name_concordance(metadata_json, vcf_files, self.output_yaml, evidence_type_yaml) expected_results = { 'overall_differences': True, 'results_per_analysis': { diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index de22550e..701551d1 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -72,7 +72,7 @@ def test_check_project_exists_and_public_in_ena_true(self): "projectAccession": "PRJEB12345" } } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [True, HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -84,7 +84,7 @@ def test_check_project_exists_and_public_in_ena_false(self): "projectAccession": "PRJEBXYZ99" } } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -100,7 +100,7 @@ def test_check_all_project_accessions(self): "childProjects": ["PRJEB456", "PRJEBNA"] }, } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [True, True, HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -133,7 +133,7 @@ def test_check_all_taxonomy_codes(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name: # Mock should only be called once per taxonomy code m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem downloading')] @@ -153,7 +153,7 @@ def test_check_uniqueness_analysis_alias(self): {"analysisAlias": "alias1"} ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_uniqueness_analysis_alias() self.assertEqual(checker.errors, [ { @@ -194,7 +194,7 @@ def test_check_all_scientific_names(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.taxonomy_valid = { 1234: False, 9606: "Homo sapiens" @@ -208,7 +208,7 @@ def test_check_all_scientific_names(self): ]) def test_check_existing_biosamples_with_checklist(self): - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch.object(SemanticMetadataChecker, '_get_biosample', side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample, old_invalid_sample2]) as m_get_sample: checker.check_existing_biosamples() @@ -234,7 +234,7 @@ def test_check_existing_biosamples_with_checklist(self): self.assertTrue(len(checker.errors) == 5) def test_check_existing_biosamples(self): - checker = SemanticMetadataChecker(metadata, sample_checklist=None) + checker = SemanticMetadataChecker(metadata, {}, sample_checklist=None) with patch.object(NoAuthHALCommunicator, 'follows_link', side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample, old_invalid_sample2]) as m_follows_link: checker.check_existing_biosamples() @@ -251,7 +251,7 @@ def test_check_existing_real_biosamples(self): {"bioSampleAccession": "SAMN01894452"} ] } - checker = SemanticMetadataChecker(metadata, sample_checklist=None) + checker = SemanticMetadataChecker(metadata, {}, sample_checklist=None) checker.check_existing_biosamples() print(checker.errors) @@ -282,7 +282,7 @@ def test_check_analysis_alias_coherence(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_analysis_alias_coherence() self.assertEqual(checker.errors, [ {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, @@ -295,7 +295,7 @@ def test_check_all_analysis_run_accessions(self): {'runAccessions': ['SRR000001', 'SRR000002']} ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_run_accessions() assert checker.errors == [] @@ -318,7 +318,7 @@ def test_check_all_analysis_contain_samples(self): ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() assert checker.errors == [] @@ -334,7 +334,7 @@ def test_check_all_analysis_contain_samples(self): ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() self.assertEqual(len(checker.errors), 2) self.assertEqual(checker.errors[0]["property"], "/analysis/1") @@ -350,25 +350,51 @@ def test_check_all_analysis_contain_samples(self): "sample": [] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() self.assertEqual(len(checker.errors), 1) self.assertEqual(checker.errors[0]["property"], "/analysis/0") self.assertEqual(checker.errors[0]["description"], "No sample found for the analysis. Should have at the least one sample.") + def test_check_all_samples_have_sample_in_vcf(self): + # Sample with genotype evidence + sampleInVCF present → no error + metadata = { + "sample": [{"analysisAlias": ["A1"], "sampleInVCF": "sample1"}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'genotype'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(checker.errors, []) + + # Sample with genotype evidence + sampleInVCF missing → error + metadata = { + "sample": [{"analysisAlias": ["A1"]}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'genotype'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(len(checker.errors), 1) + self.assertEqual(checker.errors[0]['property'], '/sample/0/sampleInVCF') + + # Sample with allele_frequency evidence + sampleInVCF missing → no error + metadata = { + "sample": [{"analysisAlias": ["A1"]}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'allele_frequency'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(checker.errors, []) + def test_check_hold_date(self): # No error when holdDate is within 2 years hold_date_ok = (datetime.now() + timedelta(days=365)).strftime('%Y-%m-%d') metadata = {"project": {"holdDate": hold_date_ok}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, []) # Error when holdDate is more than 2 years in the future hold_date_bad = (datetime.now() + timedelta(days=365 * 3)).strftime('%Y-%m-%d') metadata = {"project": {"holdDate": hold_date_bad}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, [ {'property': '/project/holdDate', 'description': 'holdDate is more than 2 years in the future'} @@ -376,6 +402,6 @@ def test_check_hold_date(self): # No error when holdDate is absent metadata = {"project": {}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, []) \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index 6cf26d2c..d1be59a1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,10 +1,12 @@ import csv -def create_mapping_file(mapping_file, vcf_files, fasta_files, assembly_reports): +def create_mapping_file(mapping_file, vcf_files, fasta_files, assembly_reports=None): with open(mapping_file, 'w', encoding='UTF8') as f: writer = csv.writer(f) writer.writerow(['vcf', 'fasta', 'report']) + if not assembly_reports: + assembly_reports = ['' for _ in range(len(vcf_files))] for vcf_file, fasta_file, assembly_reports in zip(vcf_files, fasta_files, assembly_reports): writer.writerow([vcf_file, fasta_file, assembly_reports])