KnowledgeCaptureAndDiscovery · dgarijo · Jan 14, 2026 · Jan 14, 2026
diff --git a/docs/dockerfiledoc.md b/docs/dockerfiledoc.md
@@ -0,0 +1,30 @@
+The following metadata fields can be extracted from a Dockerfile.
+These fields are defined using Dockerfile `LABEL` instructions as described in the
+[Dockerfile reference](https://docs.docker.com/reference/dockerfile/) and are interpreted
+according to the OCI Image Specification, following the
+[mapping for OCI image annotations](https://github.com/opencontainers/image-spec/blob/main/annotations.md#pre-defined-annotation-keys).
+
+| Software metadata category  | SOMEF metadata JSON path                | DOCKERFILE metadata file field     |
+|-----------------------------|-----------------------------------------|------------------------------------| 
+| authors                       |     authors[i].result.value           |   org.opencontainers.image.authors *(1)*  |
+| authors                       |     authors[i].result.value           |   LABEL maintainer *(1)*  |
+| code_repository               |     code_repository[i].result.value   |   org.opencontainers.image.url     |
+| description                   |     description[i].result.value       |     org.opencontainers.image.description    |
+| documentation                 |     documentation[i].result.value     |   org.opencontainers.image.documentation    |
+| license                       |     license[i].result.value           |     org.opencontainers.image.licenses    |
+| name                          |     name[i].result.value              |     org.opencontainers.image.ref.name         |
+| owner                         |     owner[i].result.value             |   org.opencontainers.image.vendor    |
+| version                       |     version[i].result.value           |   org.opencontainers.image.version     |
+
+
+---
+
+
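+*(1)* Authors are collected from the `org.opencontainers.image.authors` label, from `LABEL maintainer` and from the deprecated `MAINTAINER` instruction; duplicated values are reported only once.  
+- Example: 
+```
+LABEL maintainer="The Prometheus Authors <[email protected]>"
+LABEL org.opencontainers.image.authors="The Prometheus Authors"
+```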
+
+
diff --git a/docs/supported_metadata_files.md b/docs/supported_metadata_files.md
@@ -24,6 +24,8 @@ SOMEF can extract metadata from a wide range of files commonly found in software
 | `*.gemspec`        | Ruby                       | Manifest file serves as the package descriptor used in Ruby gem projects. | <div align="center">[🔍](./gemspec.md)</div>| [📄](https://guides.rubygems.org/specification-reference/)| |[Example](https://github.com/rubygems/rubygems/blob/master/bundler/bundler.gemspec) |
 | `cargo.toml`       | Rust                       | Manifest file serves as the package descriptor used in Rust projects | <div align="center">[🔍](./cargo.md)</div> | [📄](https://doc.rust-lang.org/cargo/reference/manifest.html)| |[Example](https://github.com/rust-lang/cargo/blob/master/Cargo.toml) |
 | `*.cabal`       | Haskell                       | Manifest file serving as the package descriptor for Haskell projects.| <div align="center">[🔍](./cabal.md)</div> | [📄](https://cabal.readthedocs.io/en/3.10/cabal-package.html)| |[Example](https://github.com/haskell/cabal/blob/master/Cabal/Cabal.cabal) |
+| `Dockerfile`       | Dockerfile                       | Build specification file for container images that can include software metadata via `LABEL` instructions (OCI specification). | <div align="center">[🔍](./dockerfiledoc.md)</div> | [📄](https://docs.docker.com/reference/dockerfile/)| |[Example](https://github.com/FairwindsOps/nova/blob/master/Dockerfile) |
+
 
 > **Note:** The general principles behind metadata mapping in SOMEF are based on the [CodeMeta crosswalk](https://github.com/codemeta/codemeta/blob/master/crosswalk.csv) and the [CodeMeta JSON-LD context](https://github.com/codemeta/codemeta/blob/master/codemeta.jsonld).  
 > However, each supported file type may have specific characteristics and field interpretations.

diff --git a/src/somef/parser/dockerfile_parser.py b/src/somef/parser/dockerfile_parser.py
@@ -2,46 +2,226 @@
 import os
 import re
 from ..utils import constants
+from ..process_results import Result
+
+def parse_dockerfile(file_path, metadata_result: Result, source):
+    """
+    Function that reads a Dockerfile and adds the metadata found in it to metadata_result.
+    Name, description, license, code repository, version, documentation, owner and
+    authors are looked up with the Dockerfile regular expressions defined in constants;
+    empty values are skipped.
+
+    Parameters
+    ----------
+    file_path: path of the Dockerfile to read
+    metadata_result: Result object that receives the extracted values
+    source: passed unchanged as the last argument of every add_result call
+
+    Returns
+    -------
+    The same metadata_result object. It is returned untouched when the file cannot
+    be read or decoded as UTF-8, so callers can safely rebind their result to it.
+    """
+    print(f"Extracting properties from Dockerfile: {file_path}")
 
-def extract_dockerfile_maintainer(file_path):
-    print(f"Extracting maintainers from Dockerfile: {file_path}")
-    maintainers = []
-    unique_maintainers = [] 
     try:
         with open(file_path, "rb") as file:
             raw_data = file.read()
 
-        try:
-            content = raw_data.decode("utf-8")
-        except UnicodeDecodeError:
-            logging.warning(f"File {file_path} is not UTF-8 decodable. Skipping.")
-            return maintainers
-
-        # not sure if should be better property author or a new property of maintainer
-        oci_match = re.findall(
-            constants.REGEXP_MAINTAINER_LABEL_OCI,
-            content,
-            re.IGNORECASE | re.MULTILINE
+        content = raw_data.decode("utf-8")
+    except (OSError, UnicodeDecodeError) as e:
+        logging.warning(f"Could not process Dockerfile {file_path}: {e}")
+        # Return the result object (not None): the caller rebinds its result to
+        # the returned value and keeps adding entries to it.
+        return metadata_result
+
+    title_match = re.search(
+        constants.REGEXP_DOCKER_TITLE,
+        content,
+        re.IGNORECASE
+    )
+    title = title_match.group(1).strip() if title_match else ""
+    if title:
+        metadata_result.add_result(
+            constants.CAT_NAME,
+            {
+                "value": title,
+                "type": constants.STRING
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    description_match = re.search(
+        constants.REGEXP_DOCKER_DESCRIPTION,
+        content,
+        re.IGNORECASE
+    )
+    description = description_match.group(1).strip() if description_match else ""
+    if description:
+        metadata_result.add_result(
+            constants.CAT_DESCRIPTION,
+            {
+                "value": description,
+                "type": constants.STRING
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    licenses_match = re.search(constants.REGEXP_DOCKER_LICENSES, content, re.IGNORECASE)
+    license_value = licenses_match.group(1).strip() if licenses_match else ""
+    if license_value:
+        license_info_spdx = detect_license_spdx(license_value)
+        if license_info_spdx:
+            license_data = {
+                "value": license_value,
+                "spdx_id": license_info_spdx.get('spdx_id'),
+                "name": license_info_spdx.get('name'),
+                "type": constants.LICENSE
+            }
+        else:
+            license_data = {
+                "value": license_value,
+                "type": constants.LICENSE
+            }
+        metadata_result.add_result(
+            constants.CAT_LICENSE,
+            license_data,
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    url_match = re.search(constants.REGEXP_DOCKER_URL, content, re.IGNORECASE)
+    url = url_match.group(1).strip() if url_match else ""
+    if url:
+        metadata_result.add_result(
+            constants.CAT_CODE_REPOSITORY,
+            {
+                "value": url,
+                "type": constants.URL
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    version_match = re.search(constants.REGEXP_DOCKER_VERSION, content, re.IGNORECASE)
+    version = version_match.group(1).strip() if version_match else ""
+    if version:
+        metadata_result.add_result(
+            constants.CAT_VERSION,
+            {
+                "value": version,
+                "type": constants.RELEASE,
+                "tag": version
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
-        # LABEL maintainer free
-        label_match = re.findall(
-            constants.REGEXP_MAINTAINER_LABEL_FREE,
-            content,
-            re.IGNORECASE | re.MULTILINE
+
+    documentation_match = re.search(constants.REGEXP_DOCKER_DOCUMENTATION, content, re.IGNORECASE)
+    documentation = documentation_match.group(1).strip() if documentation_match else ""
+    if documentation:
+        metadata_result.add_result(
+            constants.CAT_DOCUMENTATION,
+            {
+                "value": documentation,
+                "type": constants.STRING
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
-        # Deprecated maintainer
-        maintainer_match = re.findall(
-            constants.REGEXP_MAINTAINER,
-            content,
-            re.IGNORECASE | re.MULTILINE
+
+    vendor_match = re.search(
+        constants.REGEXP_DOCKER_VENDOR,
+        content,
+        re.IGNORECASE
+    )
+    vendor = vendor_match.group(1).strip() if vendor_match else ""
+    if vendor:
+        # Vendors matching REGEXP_LTD_INC are typed as organizations, any other as a person.
+        if re.search(constants.REGEXP_LTD_INC, vendor, re.IGNORECASE):
+            type_vendor = "Organization"
+        else:
+            type_vendor = "Person"
+
+        metadata_result.add_result(
+            constants.CAT_OWNER,
+            {
+                "value": vendor,
+                "type": type_vendor
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
+        )
+
+    # Extract maintainers: OCI label first, then the free-form LABEL maintainer
+    # and finally the deprecated maintainer pattern.
+    maintainers = []
+    for maintainer_regex in (
+        constants.REGEXP_MAINTAINER_LABEL_OCI,
+        constants.REGEXP_MAINTAINER_LABEL_FREE,
+        constants.REGEXP_MAINTAINER,
+    ):
+        maintainers.extend(
+            re.findall(maintainer_regex, content, re.IGNORECASE | re.MULTILINE)
+        )
+
+    # dict.fromkeys removes duplicates while keeping the first-seen order, so the
+    # authors are always reported in the same order (a set would not guarantee it).
+    unique_maintainers = list(
+        dict.fromkeys(m.strip() for m in maintainers if m.strip())
+    )
+
+    for maintainer in unique_maintainers:
+        metadata_result.add_result(
+            constants.CAT_AUTHORS,
+            {
+                "type": constants.AGENT,
+                "value": maintainer
+            },
+            1,
+            constants.TECHNIQUE_CODE_CONFIG_PARSER,
+            source
         )
 
-        maintainers.extend(oci_match)
-        maintainers.extend(label_match)
-        maintainers.extend(maintainer_match)
+    return metadata_result
+
+def detect_license_spdx(license_text):
+    """
+    Infers the license name and SPDX identifier from the license text of a Dockerfile
+    Parameters
+    ----------
+    license_text: text to inspect
+
+    Returns
+    -------
+    A dictionary with the keys name, spdx_id and @id, or None when nothing matches
+    """
+    print("Detecting license from text:", license_text)
+    # First pass: try the regular expression registered for each known license.
+    for name, info in constants.LICENSES_DICT.items():
+        if re.search(info["regex"], license_text, re.IGNORECASE) is None:
+            continue
+        spdx = f"{info['spdx_id']}"
+        return {"name": name, "spdx_id": spdx, "@id": f"https://spdx.org/licenses/{spdx}"}
 
-        unique_maintainers = list({m.strip() for m in maintainers if m.strip()})
-    except OSError:
-        logging.warning(f"Could not read Dockerfile {file_path}")
+    # Second pass: look for a literal SPDX identifier delimited by word boundaries.
+    for name, info in constants.LICENSES_DICT.items():
+        identifier = info["spdx_id"]
+        if re.search(rf'\b{re.escape(identifier)}\b', license_text, re.IGNORECASE):
+            return {
+                "name": name,
+                "spdx_id": identifier,
+                "@id": f"https://spdx.org/licenses/{identifier}"
+            }
+    return None
 
-    return unique_maintainers
+
diff --git a/src/somef/process_files.py b/src/somef/process_files.py
@@ -21,7 +21,7 @@
 from .parser.description_parser import parse_description_file
 from .parser.toml_parser import parse_toml_file
 from .parser.cabal_parser import parse_cabal_file
-from .parser.dockerfile_parser import extract_dockerfile_maintainer
+from .parser.dockerfile_parser import parse_dockerfile
 from chardet import detect
 
 
@@ -76,20 +76,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                 if filename == "Dockerfile" or filename.lower() == "docker-compose.yml":
                     docker_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
                                                repo_relative_path, filename)
-                    if filename == "Dockerfile":
-                        format_file = constants.FORMAT_DOCKERFILE
-                        maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
-                    else:
-                        format_file = constants.FORMAT_DOCKER_COMPOSE
-                        maintainers = None
+
+                    # full_path = os.path.join(repo_dir, file_path)
 
                     result_value = {
                         constants.PROP_VALUE: docker_url,
                         constants.PROP_TYPE: constants.URL,
-                        constants.PROP_FORMAT: format_file
                     }
-                    if maintainers:
-                        result_value[constants.PROP_AUTHOR] = maintainers
+
+                    if filename == "Dockerfile":
+                        format_file = constants.FORMAT_DOCKERFILE
+                        result_value[constants.PROP_FORMAT] = format_file
+                        metadata_result = parse_dockerfile(os.path.join(dir_path, filename), metadata_result, docker_url)
+                    else:
+                        format_file = constants.FORMAT_DOCKER_COMPOSE
+
+                    result_value[constants.PROP_FORMAT] = format_file
 
                     metadata_result.add_result(
                         constants.CAT_HAS_BUILD_FILE,
@@ -98,14 +100,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                         constants.TECHNIQUE_FILE_EXPLORATION,
                         docker_url
                     )
-                    # metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
-                    #                            {
-                    #                                constants.PROP_VALUE: docker_url,
-                    #                                constants.PROP_TYPE: constants.URL,
-                    #                                constants.PROP_FORMAT: format_file
-                    #                            },
-                    #                            1,
-                    #                            constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
+
                 if filename.lower().endswith(".ipynb"):
                     notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
                                                  repo_relative_path, filename)