From d0a81f676a911359c1fd62ca87821558c793ea90 Mon Sep 17 00:00:00 2001
From: Juanje Mendoza <juanjemd@gmail.com>
Date: Wed, 14 Jan 2026 07:58:58 +0100
Subject: [PATCH] parser dockerfile. Docs. Fixes #725

---
 docs/dockerfiledoc.md                    |  30 +++
 docs/supported_metadata_files.md         |   2 +
 src/somef/parser/dockerfile_parser.py    | 244 ++++++++++++++++++++---
 src/somef/process_files.py               |  31 ++-
 src/somef/test/test_JSON_export.py       |  83 +-------
 src/somef/test/test_dockerfile_parser.py | 190 ++++++++++++++++++
 src/somef/utils/constants.py             |  14 +-
 7 files changed, 460 insertions(+), 134 deletions(-)
 create mode 100644 docs/dockerfiledoc.md
 create mode 100644 src/somef/test/test_dockerfile_parser.py
diff --git a/docs/dockerfiledoc.md b/docs/dockerfiledoc.md
new file mode 100644
index 00000000..6dc668fd
--- /dev/null
+++ b/docs/dockerfiledoc.md
@@ -0,0 +1,30 @@
+The following metadata fields can be extracted from a Dockerfile.
+These fields are defined using Dockerfile `LABEL` instructions as described in the
+[Dockerfile reference](https://docs.docker.com/reference/dockerfile/) and are interpreted
+according to the OCI Image Specification, following the
+[mapping for OCI image annotations](https://github.com/opencontainers/image-spec/blob/main/annotations.md#pre-defined-annotation-keys).
+
+| Software metadata category  | SOMEF metadata JSON path                | DOCKERFILE metadata file field     |
+|-----------------------------|-----------------------------------------|------------------------------------| 
+| authors                       |     authors[i].result.value           |   org.opencontainers.image.authors *(1)*  |
+| authors                       |     authors[i].result.value           |   LABEL maintainer, or the deprecated MAINTAINER instruction *(1)*  |
+| code_repository               |     code_repository[i].result.value   |   org.opencontainers.image.url     |
+| description                   |     description[i].result.value       |     org.opencontainers.image.description    |
+| documentation                 |     documentation[i].result.value     |   org.opencontainers.image.documentation    |
+| license                       |     license[i].result.value           |     org.opencontainers.image.licenses    |
+| name                          |     name[i].result.value              |     org.opencontainers.image.title         |
+| owner                         |     owner[i].result.value             |   org.opencontainers.image.vendor    |
+| version                       |     version[i].result.value           |   org.opencontainers.image.version     |
+
+
+---
+
+
+*(1)*  
+- Example: 
+```
+LABEL maintainer="The Prometheus Authors <prometheus-developers@googlegroups.com>"
+LABEL org.opencontainers.image.authors="The Prometheus Authors"
+```
+
+
diff --git a/docs/supported_metadata_files.md b/docs/supported_metadata_files.md
index a02a035b..cec1fd85 100644
--- a/docs/supported_metadata_files.md
+++ b/docs/supported_metadata_files.md
@@ -24,6 +24,8 @@ SOMEF can extract metadata from a wide range of files commonly found in software
 | `*.gemspec`        | Ruby                       | Manifest file serves as the package descriptor used in Ruby gem projects. | <div align="center">[🔍](./gemspec.md)</div>| [📄](https://guides.rubygems.org/specification-reference/)| |[Example](https://github.com/rubygems/rubygems/blob/master/bundler/bundler.gemspec) |
 | `cargo.toml`       | Rust                       | Manifest file serves as the package descriptor used in Rust projects | <div align="center">[🔍](./cargo.md)</div> | [📄](https://doc.rust-lang.org/cargo/reference/manifest.html)| |[Example](https://github.com/rust-lang/cargo/blob/master/Cargo.toml) |
 | `*.cabal`       | Haskell                       | Manifest file serving as the package descriptor for Haskell projects.| <div align="center">[🔍](./cabal.md)</div> | [📄](https://cabal.readthedocs.io/en/3.10/cabal-package.html)| |[Example](https://github.com/haskell/cabal/blob/master/Cabal/Cabal.cabal) |
+| `Dockerfile`       | Dockerfile                       | Build specification file for container images that can include software metadata via LABEL instructions (OCI specification).| <div align="center">[🔍](./dockerfiledoc.md)</div> | [📄](https://docs.docker.com/reference/dockerfile/)| |[Example](https://github.com/FairwindsOps/nova/blob/master/Dockerfile) |
+
 
 > **Note:** The general principles behind metadata mapping in SOMEF are based on the [CodeMeta crosswalk](https://github.com/codemeta/codemeta/blob/master/crosswalk.csv) and the [CodeMeta JSON-LD context](https://github.com/codemeta/codemeta/blob/master/codemeta.jsonld).  
 > However, each supported file type may have specific characteristics and field interpretations.
diff --git a/src/somef/parser/dockerfile_parser.py b/src/somef/parser/dockerfile_parser.py
index a7ac96c2..d47cd449 100644
--- a/src/somef/parser/dockerfile_parser.py
+++ b/src/somef/parser/dockerfile_parser.py
@@ -2,46 +2,226 @@
 import os
 import re
 from ..utils import constants
+from ..process_results import Result
+
+def parse_dockerfile(file_path, metadata_result: Result, source):
+    """
+    Extract OCI image annotations and maintainer declarations from a Dockerfile.
+
+    Parameters
+    ----------
+    file_path: path of the Dockerfile; its content is decoded as UTF-8
+    metadata_result: Result object on which every finding is registered
+    source: value passed to add_result as the source of each finding
+
+    Returns
+    -------
+    The same metadata_result, also when the file cannot be read or decoded
+    """
+    print(f"Extracting properties from Dockerfile: {file_path}")
 
-def extract_dockerfile_maintainer(file_path):
-    print(f"Extracting maintainers from Dockerfile: {file_path}")
-    maintainers = []
-    unique_maintainers = [] 
     try:
         with open(file_path, "rb") as file:
             raw_data = file.read()
 
-        try:
-            content = raw_data.decode("utf-8")
-        except UnicodeDecodeError:
-            logging.warning(f"File {file_path} is not UTF-8 decodable. Skipping.")
-            return maintainers
-
-        # not sure if should be better property author or a new property of maintainer
-        oci_match = re.findall(
-            constants.REGEXP_MAINTAINER_LABEL_OCI,
-            content,
-            re.IGNORECASE | re.MULTILINE
+        content = raw_data.decode("utf-8")
+    except (OSError, UnicodeDecodeError) as e:
+        logging.warning(f"Could not process Dockerfile {file_path}: {e}")
+        # Return the untouched Result rather than None: the caller rebinds
+        # its metadata_result to this return value and keeps using it.
+        return metadata_result
+
+    title_match = re.search(
+        constants.REGEXP_DOCKER_TITLE,
+        content,
+        re.IGNORECASE
+    )
+    if title_match:
+        title = title_match.group(1).strip()
+        if title:
+            metadata_result.add_result(
+                constants.CAT_NAME,
+                {
+                    "value": title,
+                    "type": constants.STRING
+                },
+                1,
+                constants.TECHNIQUE_CODE_CONFIG_PARSER,
+                source
+            )
+
+    description_match = re.search(
+        constants.REGEXP_DOCKER_DESCRIPTION,
+        content,
+        re.IGNORECASE
+    )
+    if description_match:
+        description = description_match.group(1).strip()
+        if description:
+            metadata_result.add_result(
+                constants.CAT_DESCRIPTION,
+                {
+                    "value": description,
+                    "type": constants.STRING
+                },
+                1,
+                constants.TECHNIQUE_CODE_CONFIG_PARSER,
+                source
+            )
+
+    licenses_match = re.search(constants.REGEXP_DOCKER_LICENSES, content, re.IGNORECASE)
+    if licenses_match:
+        license_value = licenses_match.group(1).strip()
+        license_info_spdx = detect_license_spdx(license_value)
+        if license_info_spdx:
+            license_data = {
+                "value": license_value,
+                "spdx_id": license_info_spdx.get('spdx_id'),
+                "name": license_info_spdx.get('name'),
+                "type": constants.LICENSE
+            }
+        else:
+            license_data = {
+                "value": license_value,
+                "type": constants.LICENSE
+            }
+        metadata_result.add_result(
+            constants.CAT_LICENSE,
+            license_data,
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    url_match = re.search(constants.REGEXP_DOCKER_URL, content, re.IGNORECASE)
+    if url_match:
+        metadata_result.add_result(
+            constants.CAT_CODE_REPOSITORY,
+            {
+                "value": url_match.group(1).strip(),
+                "type": constants.URL
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    version_match = re.search(constants.REGEXP_DOCKER_VERSION, content, re.IGNORECASE)
+    if version_match:
+        metadata_result.add_result(
+            constants.CAT_VERSION,
+            {
+                "value": version_match.group(1).strip(),
+                "type": constants.RELEASE,
+                "tag": version_match.group(1).strip()
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
-        # LABEL maintainer free
-        label_match = re.findall(
-            constants.REGEXP_MAINTAINER_LABEL_FREE,
-            content,
-            re.IGNORECASE | re.MULTILINE
+
+    documentation_match = re.search(constants.REGEXP_DOCKER_DOCUMENTATION, content, re.IGNORECASE)
+    if documentation_match:
+        metadata_result.add_result(
+            constants.CAT_DOCUMENTATION,
+            {
+                "value": documentation_match.group(1).strip(),
+                "type": constants.STRING
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
-        # Deprecated maintainer
-        maintainer_match = re.findall(
-            constants.REGEXP_MAINTAINER,
-            content,
-            re.IGNORECASE | re.MULTILINE
+
+    vendor_match = re.search(
+        constants.REGEXP_DOCKER_VENDOR,
+        content,
+        re.IGNORECASE
+    )
+    if vendor_match:
+        vendor = vendor_match.group(1).strip()
+        if vendor:
+            # A vendor matching REGEXP_LTD_INC is reported with type
+            # "Organization"; any other vendor is reported with type "Person".
+            if re.search(constants.REGEXP_LTD_INC, vendor, re.IGNORECASE):
+                type_vendor = "Organization"
+            else:
+                type_vendor = "Person"
+            metadata_result.add_result(
+                constants.CAT_OWNER,
+                {
+                    "value": vendor,
+                    "type": type_vendor
+                },
+                1,
+                constants.TECHNIQUE_CODE_CONFIG_PARSER,
+                source
+            )
+
+    # Maintainers can be declared in three ways: the OCI authors label, a
+    # free-form "maintainer" label, or the deprecated MAINTAINER instruction.
+    maintainer_oci_match = re.findall(
+        constants.REGEXP_MAINTAINER_LABEL_OCI,
+        content,
+        re.IGNORECASE | re.MULTILINE
+    )
+    maintainer_label_match = re.findall(
+        constants.REGEXP_MAINTAINER_LABEL_FREE,
+        content,
+        re.IGNORECASE | re.MULTILINE
+    )
+    maintainer_match = re.findall(
+        constants.REGEXP_MAINTAINER,
+        content,
+        re.IGNORECASE | re.MULTILINE
+    )
+    maintainers = [*maintainer_oci_match, *maintainer_label_match, *maintainer_match]
+    # De-duplicate after stripping; sorting keeps the output order stable
+    # across runs (set iteration order of strings is not).
+    unique_maintainers = sorted({m.strip() for m in maintainers if m.strip()})
+
+    for maintainer in unique_maintainers:
+        metadata_result.add_result(
+            constants.CAT_AUTHORS,
+            {
+                "type": constants.AGENT,
+                "value": maintainer
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
 
-        maintainers.extend(oci_match)
-        maintainers.extend(label_match)
-        maintainers.extend(maintainer_match)
+    return metadata_result
+
+def detect_license_spdx(license_text):
+    """
+    Infer the name and SPDX id of the license described by a Dockerfile label text.
+
+    Parameters
+    ----------
+    license_text: text to inspect
+
+    Returns
+    -------
+    A dictionary with "name", "spdx_id" and "@id", or None when nothing matches
+    """
+    print("Detecting license from text:", license_text)
+    known_licenses = constants.LICENSES_DICT
+    # First pass: the descriptive regular expression of each known license.
+    for name in known_licenses:
+        entry = known_licenses[name]
+        if re.search(entry["regex"], license_text, re.IGNORECASE) is None:
+            continue
+        identifier = f"{entry['spdx_id']}"
+        return {"name": name, "spdx_id": identifier, "@id": f"https://spdx.org/licenses/{entry['spdx_id']}"}
 
-        unique_maintainers = list({m.strip() for m in maintainers if m.strip()})
-    except OSError:
-        logging.warning(f"Could not read Dockerfile {file_path}")
+    # Second pass: a bare SPDX identifier appearing as a whole word.
+    for name, entry in known_licenses.items():
+        identifier = entry["spdx_id"]
+        word_pattern = r'\b' + re.escape(identifier) + r'\b'
+        if re.search(word_pattern, license_text, re.IGNORECASE):
+            return {"name": name, "spdx_id": identifier, "@id": f"https://spdx.org/licenses/{identifier}"}
+    return None
 
-    return unique_maintainers
+ 
\ No newline at end of file
diff --git a/src/somef/process_files.py b/src/somef/process_files.py
index b15d34d6..84e55560 100644
--- a/src/somef/process_files.py
+++ b/src/somef/process_files.py
@@ -21,7 +21,7 @@
 from .parser.description_parser import parse_description_file
 from .parser.toml_parser import parse_toml_file
 from .parser.cabal_parser import parse_cabal_file
-from .parser.dockerfile_parser import extract_dockerfile_maintainer
+from .parser.dockerfile_parser import parse_dockerfile
 from chardet import detect
 
 
@@ -76,20 +76,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                 if filename == "Dockerfile" or filename.lower() == "docker-compose.yml":
                     docker_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
                                                repo_relative_path, filename)
-                    if filename == "Dockerfile":
-                        format_file = constants.FORMAT_DOCKERFILE
-                        maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
-                    else:
-                        format_file = constants.FORMAT_DOCKER_COMPOSE
-                        maintainers = None
+                    
+                    # full_path = os.path.join(repo_dir, file_path)
 
                     result_value = {
                         constants.PROP_VALUE: docker_url,
                         constants.PROP_TYPE: constants.URL,
-                        constants.PROP_FORMAT: format_file
                     }
-                    if maintainers:
-                        result_value[constants.PROP_AUTHOR] = maintainers
+
+                    if filename == "Dockerfile":
+                        format_file = constants.FORMAT_DOCKERFILE
+                        result_value[constants.PROP_FORMAT] = format_file
+                        metadata_result = parse_dockerfile(os.path.join(dir_path, filename), metadata_result, docker_url)
+                    else:
+                        format_file = constants.FORMAT_DOCKER_COMPOSE
+
+                    result_value[constants.PROP_FORMAT] = format_file
 
                     metadata_result.add_result(
                         constants.CAT_HAS_BUILD_FILE,
@@ -98,14 +100,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                         constants.TECHNIQUE_FILE_EXPLORATION,
                         docker_url
                     )
-                    # metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
-                    #                            {
-                    #                                constants.PROP_VALUE: docker_url,
-                    #                                constants.PROP_TYPE: constants.URL,
-                    #                                constants.PROP_FORMAT: format_file
-                    #                            },
-                    #                            1,
-                    #                            constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
+ 
                 if filename.lower().endswith(".ipynb"):
                     notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
                                                  repo_relative_path, filename)
diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py
index 8572da73..befc47f2 100644
--- a/src/somef/test/test_JSON_export.py
+++ b/src/somef/test/test_JSON_export.py
@@ -470,85 +470,4 @@ def test_issue_859(self):
         os.remove(test_data_path + "test-859.json")
 
 
-    def test_issue_725(self):
-            """Checks if this repository has authors extracted from Dockerfile"""
-
-            somef_cli.run_cli(threshold=0.8,
-                                ignore_classifiers=False,
-                                repo_url=None,
-                                local_repo=test_data_repositories + "Fairwinds",
-                                doc_src=None,
-                                in_file=None,
-                                output=test_data_path + "test_issue_725.json",
-                                graph_out=None,
-                                graph_format="turtle",
-                                codemeta_out=None,
-                                pretty=True,
-                                missing=False,
-                                readme_only=False)
-            
-            text_file = open(test_data_path + "test_issue_725.json", "r")
-            data = text_file.read()
-            text_file.close()
-            json_content = json.loads(data)
-
-            has_built = json_content.get("has_build_file", [])
-
-            authors = []
-            for entry in has_built:
-                result = entry.get("result", {})
-                if "author" in result:
-                    authors.extend(result["author"])
-    
-            expected_author = "FairwindsOps, Inc."
-
-            assert expected_author in authors, (
-                f"Expected author '{expected_author}' not found. "
-                f"Authors found: {authors}"
-            )
-            os.remove(test_data_path + "test_issue_725.json")
-
-    def test_issue_725_2(self):
-            """Checks if this repository has authors extracted from Dockerfile"""
-
-            somef_cli.run_cli(threshold=0.8,
-                                ignore_classifiers=False,
-                                repo_url=None,
-                                local_repo=test_data_repositories + "Prometeus",
-                                doc_src=None,
-                                in_file=None,
-                                output=test_data_path + "test_issue_725_2.json",
-                                graph_out=None,
-                                graph_format="turtle",
-                                codemeta_out=None,
-                                pretty=True,
-                                missing=False,
-                                readme_only=False)
-            
-            text_file = open(test_data_path + "test_issue_725_2.json", "r")
-            data = text_file.read()
-            text_file.close()
-            json_content = json.loads(data)
-
-            has_built = json_content.get("has_build_file", [])
-
-            authors = []
-            for entry in has_built:
-                result = entry.get("result", {})
-                if "author" in result:
-                    authors.extend(result["author"])
-    
-            expected_author = "The Prometheus Authors"
-
-            assert expected_author in authors, (
-                f"Expected author '{expected_author}' not found. "
-                f"Authors found: {authors}"
-            )
-            expected_count = 2
-            assert len(authors) == expected_count, (
-                f"Expected {expected_count} authors, but found {len(authors)}: {authors}"
-            )
-            os.remove(test_data_path + "test_issue_725_2.json")
-
-if __name__ == '__main__':
-    unittest.main()
+   
\ No newline at end of file
diff --git a/src/somef/test/test_dockerfile_parser.py b/src/somef/test/test_dockerfile_parser.py
new file mode 100644
index 00000000..fd95dfb7
--- /dev/null
+++ b/src/somef/test/test_dockerfile_parser.py
@@ -0,0 +1,190 @@
+import json
+import os
+import unittest
+from pathlib import Path
+from .. import somef_cli
+from ..utils import constants
+
+test_data_path = str(Path(__file__).parent / "test_data") + os.path.sep
+test_data_repositories = str(Path(__file__).parent / "test_data" / "repositories") + os.path.sep
+
+class TestDockerfileParser(unittest.TestCase):
+    """End-to-end checks of the metadata SOMEF extracts from the Dockerfile of
+    the bundled Fairwinds and Prometeus test repositories."""
+
+    def test_issue_725(self):
+        """Checks if this repository has properties extracted from Dockerfile Fairwinds"""
+
+        somef_cli.run_cli(threshold=0.8,
+                            ignore_classifiers=False,
+                            repo_url=None,
+                            local_repo=test_data_repositories + "Fairwinds",
+                            doc_src=None,
+                            in_file=None,
+                            output=test_data_path + "test_issue_725.json",
+                            graph_out=None,
+                            graph_format="turtle",
+                            codemeta_out=None,
+                            pretty=True,
+                            missing=False,
+                            readme_only=False)
+
+        # The context manager closes the report even if parsing fails.
+        with open(test_data_path + "test_issue_725.json", "r") as json_file:
+            json_content = json.load(json_file)
+
+        owners = json_content.get("owner", [])
+
+        code_parser_owners = [
+            entry["result"]["value"]
+            for entry in owners
+            if entry.get("technique") == "code_parser"
+        ]
+
+        assert "FairwindsOps, Inc." in code_parser_owners, (
+            "Expected owner 'FairwindsOps, Inc.' extracted from Dockerfile "
+            f"with technique 'code_parser'. Found: {code_parser_owners}"
+        )
+
+        descriptions = json_content.get("description", [])
+        docker_descriptions = [
+            entry["result"]["value"]
+            for entry in descriptions
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_description = (
+            "Nova is a cli tool to find outdated or deprecated Helm charts "
+            "running in your Kubernetes cluster."
+        )
+
+        assert expected_description in docker_descriptions, (
+            "Expected description extracted from Dockerfile not found.\n"
+            f"Expected: {expected_description}\n"
+            f"Found: {docker_descriptions}"
+        )
+
+        documentation = json_content.get("documentation", [])
+
+        doc_urls = [
+            entry["result"]["value"]
+            for entry in documentation
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_doc = "https://nova.docs.fairwinds.com/"
+
+        assert expected_doc in doc_urls, (
+            f"Expected documentation URL '{expected_doc}' not found. "
+            f"Found: {doc_urls}"
+        )
+
+        authors = json_content.get("authors", [])
+
+        author_values = [
+            entry["result"]["value"]
+            for entry in authors
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_author = "FairwindsOps, Inc."
+
+        assert expected_author in author_values, (
+            f"Expected author '{expected_author}' not found. "
+            f"Authors found: {author_values}"
+        )
+        os.remove(test_data_path + "test_issue_725.json")
+
+    def test_issue_725_2(self):
+        """Checks if this repository has properties extracted from Dockerfile Prometeus"""
+
+        somef_cli.run_cli(threshold=0.8,
+                            ignore_classifiers=False,
+                            repo_url=None,
+                            local_repo=test_data_repositories + "Prometeus",
+                            doc_src=None,
+                            in_file=None,
+                            output=test_data_path + "test_issue_725_2.json",
+                            graph_out=None,
+                            graph_format="turtle",
+                            codemeta_out=None,
+                            pretty=True,
+                            missing=False,
+                            readme_only=False)
+
+        # The context manager closes the report even if parsing fails.
+        with open(test_data_path + "test_issue_725_2.json", "r") as json_file:
+            json_content = json.load(json_file)
+
+        code_repos = json_content.get("code_repository", [])
+        code_parser_repos = [
+            entry["result"]["value"]
+            for entry in code_repos
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_repo = "https://github.com/prometheus/prometheus"
+
+        assert expected_repo in code_parser_repos, (
+            f"Expected code_repository '{expected_repo}' extracted with technique "
+            f"'code_parser'. Found: {code_parser_repos}"
+        )
+
+        licenses = json_content.get("license", [])
+        code_parser_licenses = [
+            entry["result"]
+            for entry in licenses
+            if entry.get("technique") == "code_parser"
+        ]
+
+        assert any(
+            lic.get("spdx_id") == "Apache-2.0"
+            for lic in code_parser_licenses
+        ), (
+            "Expected license with SPDX ID 'Apache-2.0' extracted from Dockerfile "
+            f"using 'code_parser'. Found: {code_parser_licenses}"
+        )
+
+        descriptions = json_content.get("description", [])
+        code_parser_descriptions = [
+            entry["result"]["value"]
+            for entry in descriptions
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_description = "The Prometheus monitoring system and time series database"
+        assert expected_description in code_parser_descriptions, (
+            "Expected description extracted from Dockerfile not found.\n"
+            f"Expected: {expected_description}\n"
+            f"Found: {code_parser_descriptions}"
+        )
+
+        names = json_content.get("name", [])
+        code_parser_names = [
+            entry["result"]["value"]
+            for entry in names
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_name = "Prometheus"
+        assert expected_name in code_parser_names, (
+            f"Expected name '{expected_name}' extracted from Dockerfile "
+            f"using 'code_parser'. Found: {code_parser_names}"
+        )
+
+        documentation = json_content.get("documentation", [])
+        code_parser_docs = [
+            entry["result"]["value"]
+            for entry in documentation
+            if entry.get("technique") == "code_parser"
+        ]
+
+        expected_doc = "https://prometheus.io/docs"
+        assert expected_doc in code_parser_docs, (
+            f"Expected documentation URL '{expected_doc}' extracted from Dockerfile "
+            f"using 'code_parser'. Found: {code_parser_docs}"
+        )
+        os.remove(test_data_path + "test_issue_725_2.json")
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 7db3d9e3..1829bc80 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -85,7 +85,8 @@
 REGEXP_TITLE_NATURAL = r'["“](.+?)["”]'
 
 #License spdx
-REGEXP_APACHE = r'(?i)apache\s+license\s*,?\s*version\s*2\.0'
+# REGEXP_APACHE = r'(?i)apache\s+license\s*,?\s*version\s*2\.0'
+REGEXP_APACHE = r'(?i)apache(?:\s+license)?\s*(?:,?\s*version\s*)?2\.0'
 REGEXP_GPL3 = r'(?i)gnu\s+general\s+public\s+license\s*,?\s*version\s*3\.0'
 REGEXP_MIT = r'(?i)mit\s+license'
 REGEXP_BSD2 = r'(?i)(bsd\s*-?\s*2-?clause(?:\s*license)?|redistribution\s+and\s+use\s+in\s+source\s+and\s+binary\s+forms)'
@@ -460,4 +461,13 @@ class RepositoryType(Enum):
 # REGEXP_MAINTAINER_LABEL_OCI = r'^\s*LABEL\s+org\.opencontainers\.image\.authors\s*=\s*["\']?(.+?)["\']?\s*$'
 REGEXP_MAINTAINER_LABEL_OCI = r'^\s*LABEL\s+org\.opencontainers\.image\.authors\s*=\s*["\']([^"\'\\]+)["\']?\s*(?:\\)?\s*$'
 REGEXP_MAINTAINER_LABEL_FREE = r'^\s*LABEL\s+"?maintainer"?\s*=\s*["\']?(.+?)["\']?\s*$'
-REGEXP_MAINTAINER = r'^\s*MAINTAINER\s+(.+)$'
\ No newline at end of file
+REGEXP_MAINTAINER = r'^\s*MAINTAINER\s+(.+)$'
+REGEXP_DOCKER_TITLE = r'org\.opencontainers\.image\.title\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_DESCRIPTION = r'org\.opencontainers\.image\.description\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_LICENSES = r'org\.opencontainers\.image\.licenses\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_SOURCE   = r'org\.opencontainers\.image\.source\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_URL      = r'org\.opencontainers\.image\.url\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_VERSION = r'org\.opencontainers\.image\.version\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_DOCUMENTATION = r'org\.opencontainers\.image\.documentation\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_VENDOR = r'org\.opencontainers\.image\.vendor\s*=\s*"([^"]+)"'
+REGEXP_DOCKER_CREATED_DATE = r'org\.opencontainers\.image\.created\s*=\s*"([^"]+)"'