EBIvariation · tcezard · Mar 19, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 12, 2026
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -0,0 +1,60 @@
+# Installs vcf-validator, biovalidator and Nextflow, then runs the tests selected by the "integration" pytest marker.
+name: Integration tests
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  VCF_VALIDATOR_VERSION: "0.10.2"
+  NXF_VER: "23.10.0"
+
+jobs:
+  integration-tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install Java and Node
+        # apt-get rather than apt: apt warns that its command line interface is not stable for scripts
+        run: sudo apt-get update && sudo apt-get install -y default-jdk nodejs npm git curl
+
+      - name: Install vcf-validator and vcf-assembly-checker
+        run: |
+          # -f: exit non-zero on an HTTP error instead of saving the error page as the executable
+          # sudo: write into /usr/local/bin the same way the Nextflow step does
+          sudo curl -fLJo /usr/local/bin/vcf_validator \
+            https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_validator_linux
+          sudo curl -fLJo /usr/local/bin/vcf_assembly_checker \
+            https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_assembly_checker_linux
+          sudo chmod 755 /usr/local/bin/vcf_validator /usr/local/bin/vcf_assembly_checker
+
+      - name: Install biovalidator
+        run: |
+          git clone https://github.com/elixir-europe/biovalidator.git
+          cd biovalidator
+          npm install
+          sudo npm link
+
+      - name: Install Nextflow
+        run: |
+          curl -fL "https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VER}/nextflow-${NXF_VER}-all" | bash
+          sudo mv nextflow /usr/local/bin/
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install pytest
+          python -m pip install .
+
+      - name: Run integration tests
+        run: |
+          PYTHONPATH=. pytest tests -m integration
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=linux/amd64 python:3.10
+FROM python:3.10
 
 ENV vcf_validator_version=0.10.2
 ENV NXF_VER=23.10.0

diff --git a/docs/input_file_overview.md b/docs/input_file_overview.md
@@ -98,7 +98,8 @@ alias, which is a shortened identifier you must provide for each analysis.
 
 This is where you describe the biological samples used for your analyses. Each row describes one sample and must include
 the Analysis Alias to indicate which analysis it belongs to, and "Sample Name in VCF" which is the exact name of the 
-sample as it appears in the VCF file.
+sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies),
+then you can omit the "Sample Name in VCF" column.
 
 We accept preregistered samples, which should be provided using BioSamples sample or sampleset accessions. Please
 ensure these are publicly accessible, as otherwise EVA will not be able to validate them.

diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json
@@ -318,7 +318,6 @@
           {
             "required": [
               "analysisAlias",
-              "sampleInVCF",
               "bioSampleObject"
             ]
           }
@@ -334,7 +333,7 @@
           },
           "sampleInVCF": {
             "type": "string",
-            "description": "Sample Name used in the VCF file"
+            "description": "Sample Name used in the VCF file. It is a required field when genotypes are provided."
           },
           "bioSampleAccession": {
             "type": "string",

diff --git a/eva_sub_cli/executables/check_metadata_semantics.py b/eva_sub_cli/executables/check_metadata_semantics.py
@@ -1,18 +1,31 @@
 import argparse
 import json
 
+import yaml
+
 from eva_sub_cli.semantic_metadata import SemanticMetadataChecker
 
 
 def main():
+    """
+    Command line entry point: run the semantic checks on an EVA metadata JSON file.
+
+    Loads the metadata JSON and the evidence type results YAML given on the command line, calls check_all() on a
+    SemanticMetadataChecker built from both, then writes the checker results to the --output_yaml path.
+    """
     arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata')
     arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file')
+    arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results',
+                            help='Results of the evidence check')
     arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
                             help='Path to the location of the results')
     args = arg_parser.parse_args()
 
     with open(args.metadata_json) as open_json:
         metadata = json.load(open_json)
-        checker = SemanticMetadataChecker(metadata)
-        checker.check_all()
-        checker.write_result_yaml(args.output_yaml)
+    with open(args.evidence_type_results) as open_yaml:
+        # safe_load returns None for an empty document: fall back to an empty mapping
+        evidence_type_results = yaml.safe_load(open_yaml) or {}
+    checker = SemanticMetadataChecker(metadata, evidence_type_results)
+    checker.check_all()
+    checker.write_result_yaml(args.output_yaml)
diff --git a/eva_sub_cli/executables/samples_checker.py b/eva_sub_cli/executables/samples_checker.py
@@ -54,7 +54,7 @@ def compare_names_in_files_and_samples(sample_name_in_analysis, sample_name_per_
             more_metadata_submitted_files)
 
 
-def compare_all_analysis(metadata, files_per_analysis):
+def compare_all_analysis(metadata, files_per_analysis, evidence_type_results):
     overall_differences = False
     results_per_analysis_alias = {}
     all_analysis_alias = set(metadata.samples_per_analysis) | set(files_per_analysis)
@@ -65,7 +65,7 @@ def compare_all_analysis(metadata, files_per_analysis):
             for file_path in files_per_analysis.get(analysis_alias, [])
         }
 
-        if need_to_check_samples(sample_name_per_file):
+        if need_to_check_samples(evidence_type_results, analysis_alias):
             (
                 has_difference, more_per_submitted_files_metadata,
                 more_submitted_files_metadata, more_metadata_submitted_files
@@ -89,13 +89,18 @@ def compare_all_analysis(metadata, files_per_analysis):
     return overall_differences, results_per_analysis_alias
 
 
-def need_to_check_samples(sample_name_per_file):
-    no_samples_in_vcf = all(len(v) == 0 for v in sample_name_per_file.values())
-    if no_samples_in_vcf:
-        evidence_types_for_vcf_files = [detect_vcf_evidence_type(vcf_file) for vcf_file in sample_name_per_file.keys()]
-        if set(evidence_types_for_vcf_files) == {'allele_frequency'}:
-            return False
-
+def need_to_check_samples(evidence_type_results, analysis_alias):
+    """
+    Tell whether the sample names of an analysis have to be compared with the ones found in its VCF files.
+
+    Return False only when evidence_type_results records the 'allele_frequency' evidence type for
+    analysis_alias. Return True otherwise, including when the analysis is missing from the results or when
+    the results are None (what yaml.safe_load returns for an empty document).
+    """
+    # `or {}` twice: tolerate both empty results and an analysis mapped to null
+    analysis_result = (evidence_type_results or {}).get(analysis_alias) or {}
+    if analysis_result.get('evidence_type') == 'allele_frequency':
+        return False
     return True
 
 
@@ -107,14 +103,22 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali
         }, stream=open_yaml)
 
 
-def check_sample_name_concordance(metadata_json, vcf_files, output_yaml):
+def check_sample_name_concordance(metadata_json, vcf_files, output_yaml, evidence_type_result_file):
     """
     Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones
-    found in the VCF files
+    found in the VCF files, and write the outcome to output_yaml.
+
+    evidence_type_result_file is the YAML file holding the results of the evidence type check; it is loaded and
+    passed on to compare_all_analysis.
     """
     metadata = EvaMetadataJson(metadata_json)
+    with open(evidence_type_result_file) as open_yaml:
+        # safe_load returns None for an empty document: fall back to an empty mapping
+        evidence_type_results = yaml.safe_load(open_yaml) or {}
     file_path_per_analysis = associate_vcf_path_with_analysis(metadata, vcf_files)
-    overall_differences, results_per_analysis_alias = compare_all_analysis(metadata, file_path_per_analysis)
+    overall_differences, results_per_analysis_alias = compare_all_analysis(
+        metadata, file_path_per_analysis, evidence_type_results
+    )
     write_result_yaml(output_yaml, overall_differences, results_per_analysis_alias)
 
 
@@ -127,7 +125,10 @@ def main():
                             help='Path to the vcf files to compare to the metadata')
     arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml',
                             help='Path to the location of the results')
+    arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results',
+                            help='Results of the evidence check')
+
 
     args = arg_parser.parse_args()
     logging_config.add_stdout_handler()
-    check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml)
+    check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml, args.evidence_type_results)
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf
@@ -105,20 +105,31 @@ workflow {
 	collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
 
 	// Task-specific processing
+	evidence_type_results = null
+
     if (params.tasks.contains(VCF_CHECK)) {
         check_vcf_valid(vcf_and_ref_ch)
         evidence_type_check(metadata_json, vcf_files.collect())
+        evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
 	}
 	if (params.tasks.contains(ASSEMBLY_CHECK)) {
 		check_vcf_reference(vcf_and_ref_ch)
 		insdc_checker(metadata_json, fasta_to_vcfs)
 	}
 	if (params.tasks.contains(METADATA_CHECK)) {
+	    if (!evidence_type_results) {
+            evidence_type_check(metadata_json, vcf_files.collect())
+            evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
+	    }
 		metadata_json_validation(metadata_json)
-		metadata_semantic_check(metadata_json)
+		metadata_semantic_check(metadata_json, evidence_type_results)
 	}
 	if (params.tasks.contains(SAMPLE_CHECK)) {
-		sample_name_concordance(metadata_json, vcf_files.collect())
+	    if (!evidence_type_results){
+            evidence_type_check(metadata_json, vcf_files.collect())
+            evidence_type_results = evidence_type_check.out.evidence_type_checker_yml
+	    }
+		sample_name_concordance(metadata_json, vcf_files.collect(), evidence_type_results)
 	}
 }
 
@@ -294,14 +305,17 @@ process sample_name_concordance {
     input:
     path(metadata_json)
     path(vcf_files)
+    path(evidence_type_results)
 
     output:
     path "sample_checker.yml", emit: sample_checker_yml
     path "sample_checker.log", emit: sample_checker_log
 
     script:
     """
-    $params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1
+    $params.python_scripts.samples_checker --metadata_json $metadata_json \
+        --vcf_files $vcf_files --output_yaml sample_checker.yml \
+        --evidence_type_results $evidence_type_results > sample_checker.log 2>&1
     """
 }
 
@@ -362,13 +376,17 @@ process metadata_semantic_check {
 
     input:
     path(metadata_json)
+    path(evidence_type_results)
 
     output:
     path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
     path "semantic_checker.log", emit: semantic_checker_log
 
     script:
     """
-    $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
+    $params.python_scripts.semantic_checker \
+    --metadata_json $metadata_json \
+    --evidence_type_results $evidence_type_results \
+    --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
     """
 }
diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py
@@ -20,6 +20,7 @@
 PARENT_PROJECT_KEY = 'parentProject'
 CHILD_PROJECTS_KEY = 'childProjects'
 PEER_PROJECTS_KEY = 'peerProjects'
+SAMPLE_IN_VCF_KEY = 'sampleInVCF'
 BIOSAMPLE_OBJECT_KEY = 'bioSampleObject'
 BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession'
 CHARACTERISTICS_KEY = 'characteristics'
@@ -40,9 +41,10 @@ def cast_list(l, type_to_cast=str):
 
 class SemanticMetadataChecker(AppLogger):
 
-    def __init__(self, metadata, sample_checklist='ERC000011'):
+    def __init__(self, metadata, evidence_type_results, sample_checklist='ERC000011'):
         self.sample_checklist = sample_checklist
         self.metadata = metadata
+        self.evidence_type_results = evidence_type_results
         self.errors = []
         # Caches whether taxonomy code is valid or not, and maps to scientific name if valid
         self.taxonomy_valid = {}
@@ -60,6 +62,7 @@ def check_all(self):
         self.check_all_analysis_run_accessions()
         self.check_analysis_alias_coherence()
         self.check_all_analysis_contain_samples()
+        self.check_all_samples_have_sample_in_vcf()
         self.check_hold_date()
 
     def check_hold_date(self):
@@ -316,3 +319,28 @@ def check_all_analysis_contain_samples(self):
                 json_path = f'/{ANALYSIS_KEY}/{idx}'
                 self.add_error(property=json_path,
                                description=f'No sample found for the analysis. Should have at the least one sample.')
+
+    def check_all_samples_have_sample_in_vcf(self):
+        """
+        Check that every sample provides sampleInVCF unless all its analyses only hold allele frequencies.
+
+        The name is required as soon as one analysis of the sample has an evidence type other than
+        'allele_frequency' in the evidence type results; an analysis missing from those results counts as
+        requiring it. Samples without any analysis alias are not checked here.
+        """
+        # Evidence type results may be None, e.g. when they were loaded from an empty YAML document
+        evidence_type_results = self.evidence_type_results or {}
+        for idx, sample in enumerate(self.metadata[SAMPLE_KEY]):
+            sample_in_vcf_required = any(
+                (evidence_type_results.get(analysis_alias) or {}).get('evidence_type') != 'allele_frequency'
+                for analysis_alias in sample.get(ANALYSIS_ALIAS_KEY, [])
+            )
+            if not sample_in_vcf_required:
+                continue
+            sample_in_vcf = sample.get(SAMPLE_IN_VCF_KEY)
+            if sample_in_vcf is None or sample_in_vcf == '':
+                # Keyword arguments, as in the add_error call of check_all_analysis_contain_samples
+                self.add_error(
+                    property=f'/{SAMPLE_KEY}/{idx}/{SAMPLE_IN_VCF_KEY}',
+                    description=f'{SAMPLE_IN_VCF_KEY} must be provided when Genotypes are present in the VCF file'
+                )
diff --git a/tests/resources/sample_checker/evidence_type.yaml b/tests/resources/sample_checker/evidence_type.yaml
@@ -0,0 +1,6 @@
+VD1:
+  evidence_type: genotype
+VD2:
+  evidence_type: genotype
+VD3:
+  evidence_type: genotype
diff --git a/tests/resources/sample_in_vcf_check/allele_freq.vcf b/tests/resources/sample_in_vcf_check/allele_freq.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1,length=249250621>
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total Allele Count">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele Count">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	10177	rs367896724	A	AC	100	PASS	AF=0.11;AN=2000;AC=220
+1	10505	rs548419688	A	T	100	PASS	AF=0.09;AN=2000;AC=180
diff --git a/tests/resources/sample_in_vcf_check/fake_fasta.fa b/tests/resources/sample_in_vcf_check/fake_fasta.fa
@@ -0,0 +1,2 @@
+>fasta
+AAA
diff --git a/tests/resources/sample_in_vcf_check/genotype.vcf b/tests/resources/sample_in_vcf_check/genotype.vcf
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1,assembly=b37,length=249250621>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1	sample2	sample3
+1	10177	rs367896724	A	AC	100	PASS	.	GT	1|0	0|1	0|0
+1	10505	rs548419688	A	T	100	PASS	.	GT	0|0	0|0	0|1
diff --git a/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json
@@ -0,0 +1 @@
+{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]}
diff --git a/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json
@@ -0,0 +1 @@
+{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]}
diff --git a/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json
@@ -0,0 +1 @@
+{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]}
diff --git a/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json
@@ -0,0 +1 @@
+{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]}