Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docs/dockerfiledoc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
The following metadata fields can be extracted from a Dockerfile.
These fields are defined using Dockerfile `LABEL` instructions as described in the
[Dockerfile reference](https://docs.docker.com/reference/dockerfile/) and are interpreted
according to the OCI Image Specification, following the
[mapping for OCI image annotations](https://github.com/opencontainers/image-spec/blob/main/annotations.md#pre-defined-annotation-keys).

| Software metadata category | SOMEF metadata JSON path | DOCKERFILE metadata file field |
|-----------------------------|-----------------------------------------|------------------------------------|
| authors | authors[i].result.value | org.opencontainers.image.authors *(1)* |
| authors | authors[i].result.value | LABEL maintainer *(1)* |
| code_repository | code_repository[i].result.value | org.opencontainers.image.url |
| description | description[i].result.value | org.opencontainers.image.description |
| documentation | documentation[i].result.value | org.opencontainers.image.documentation |
| license | license[i].result.value | org.opencontainers.image.licenses |
| name | name[i].result.value | org.opencontainers.image.ref.name |
| owner | owner[i].result.value | org.opencontainers.image.vendor |
| version | version[i].result.value | org.opencontainers.image.version |


---


*(1)*
- Example:
```
LABEL maintainer="The Prometheus Authors <prometheus-developers@googlegroups.com>"
LABEL org.opencontainers.image.authors="The Prometheus Authors"
```


2 changes: 2 additions & 0 deletions docs/supported_metadata_files.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ SOMEF can extract metadata from a wide range of files commonly found in software
| `*.gemspec` | Ruby | Manifest file serves as the package descriptor used in Ruby gem projects. | <div align="center">[🔍](./gemspec.md)</div>| [📄](https://guides.rubygems.org/specification-reference/)| |[Example](https://github.com/rubygems/rubygems/blob/master/bundler/bundler.gemspec) |
| `cargo.toml` | Rust | Manifest file serves as the package descriptor used in Rust projects | <div align="center">[🔍](./cargo.md)</div> | [📄](https://doc.rust-lang.org/cargo/reference/manifest.html)| |[Example](https://github.com/rust-lang/cargo/blob/master/Cargo.toml) |
| `*.cabal` | Haskell | Manifest file serving as the package descriptor for Haskell projects.| <div align="center">[🔍](./cabal.md)</div> | [📄](https://cabal.readthedocs.io/en/3.10/cabal-package.html)| |[Example](https://github.com/haskell/cabal/blob/master/Cabal/Cabal.cabal) |
| `dockerfile` | Dockerfile | Build specification file for container images that can include software metadata via LABEL instructions (OCI specification).| <div align="center">[🔍](./dockerfiledoc.md)</div> | [📄](https://docs.docker.com/reference/dockerfile/)| |[Example](https://github.com/FairwindsOps/nova/blob/master/Dockerfile) |


> **Note:** The general principles behind metadata mapping in SOMEF are based on the [CodeMeta crosswalk](https://github.com/codemeta/codemeta/blob/master/crosswalk.csv) and the [CodeMeta JSON-LD context](https://github.com/codemeta/codemeta/blob/master/codemeta.jsonld).
> However, each supported file type may have specific characteristics and field interpretations.
Expand Down
244 changes: 212 additions & 32 deletions src/somef/parser/dockerfile_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,226 @@
import os
import re
from ..utils import constants
from ..process_results import Result

def parse_dockerfile(file_path, metadata_result: "Result", source):
    """
    Extract software metadata from the LABEL instructions of a Dockerfile.

    The file content is matched against the Dockerfile regular expressions
    declared in ``constants``. Every non-empty value found is stored in
    ``metadata_result`` with confidence 1 and the code-config-parser technique.

    Parameters
    ----------
    file_path
        Path of the Dockerfile on disk.
    metadata_result
        Result container exposing ``add_result``; it is filled in place.
    source
        Provenance (link or path of the Dockerfile) attached to every result.

    Returns
    -------
    The same ``metadata_result`` object. It is returned untouched when the
    file cannot be read or is not valid UTF-8, so callers that rebind their
    result variable to the return value never end up holding ``None``.
    """
    logging.info("Extracting properties from Dockerfile: %s", file_path)

    try:
        with open(file_path, "rb") as file:
            content = file.read().decode("utf-8")
    except (OSError, UnicodeDecodeError) as e:
        logging.warning("Could not process Dockerfile %s: %s", file_path, e)
        # Do not return None here: the caller keeps using the returned object.
        return metadata_result

    title = _first_label_value(constants.REGEXP_DOCKER_TITLE, content)
    if title:
        _add_parsed_result(
            metadata_result,
            constants.CAT_NAME,
            {"value": title, "type": constants.STRING},
            source,
        )

    description = _first_label_value(constants.REGEXP_DOCKER_DESCRIPTION, content)
    if description:
        _add_parsed_result(
            metadata_result,
            constants.CAT_DESCRIPTION,
            {"value": description, "type": constants.STRING},
            source,
        )

    license_value = _first_label_value(constants.REGEXP_DOCKER_LICENSES, content)
    if license_value:
        license_data = {"value": license_value}
        # Enrich the raw label with SPDX information when it is recognised.
        license_info_spdx = detect_license_spdx(license_value)
        if license_info_spdx:
            license_data["spdx_id"] = license_info_spdx.get("spdx_id")
            license_data["name"] = license_info_spdx.get("name")
        license_data["type"] = constants.LICENSE
        _add_parsed_result(metadata_result, constants.CAT_LICENSE, license_data, source)

    # NOTE: the pattern constants.REGEXP_DOCKER_SOURCE is intentionally not
    # mapped to any category yet.

    url = _first_label_value(constants.REGEXP_DOCKER_URL, content)
    if url:
        _add_parsed_result(
            metadata_result,
            constants.CAT_CODE_REPOSITORY,
            {"value": url, "type": constants.URL},
            source,
        )

    version = _first_label_value(constants.REGEXP_DOCKER_VERSION, content)
    if version:
        _add_parsed_result(
            metadata_result,
            constants.CAT_VERSION,
            {"value": version, "type": constants.RELEASE, "tag": version},
            source,
        )

    documentation = _first_label_value(constants.REGEXP_DOCKER_DOCUMENTATION, content)
    if documentation:
        _add_parsed_result(
            metadata_result,
            constants.CAT_DOCUMENTATION,
            {"value": documentation, "type": constants.STRING},
            source,
        )

    vendor = _first_label_value(constants.REGEXP_DOCKER_VENDOR, content)
    if vendor:
        # A vendor matching REGEXP_LTD_INC is reported as an organization;
        # anything else is reported as a person.
        is_organization = re.search(constants.REGEXP_LTD_INC, vendor, re.IGNORECASE)
        _add_parsed_result(
            metadata_result,
            constants.CAT_OWNER,
            {"value": vendor, "type": "Organization" if is_organization else "Person"},
            source,
        )

    # Maintainers can be declared in three ways: the OCI authors label, a free
    # "LABEL maintainer" and the deprecated MAINTAINER instruction.
    maintainer_patterns = (
        constants.REGEXP_MAINTAINER_LABEL_OCI,
        constants.REGEXP_MAINTAINER_LABEL_FREE,
        constants.REGEXP_MAINTAINER,
    )
    maintainers = []
    for pattern in maintainer_patterns:
        maintainers.extend(re.findall(pattern, content, re.IGNORECASE | re.MULTILINE))

    # dict.fromkeys removes duplicates while keeping the order of appearance,
    # so the reported authors are deterministic between runs.
    stripped_maintainers = (m.strip() for m in maintainers)
    unique_maintainers = list(dict.fromkeys(m for m in stripped_maintainers if m))

    for maintainer in unique_maintainers:
        _add_parsed_result(
            metadata_result,
            constants.CAT_AUTHORS,
            {"type": constants.AGENT, "value": maintainer},
            source,
        )

    return metadata_result


def _first_label_value(pattern, content):
    """Return the stripped first capture group of ``pattern`` in ``content``, or None if absent/empty."""
    match = re.search(pattern, content, re.IGNORECASE)
    if not match:
        return None
    return (match.group(1) or "").strip() or None


def _add_parsed_result(metadata_result, category, value, source):
    """Store ``value`` under ``category`` with confidence 1 and the code-config-parser technique."""
    metadata_result.add_result(
        category,
        value,
        1,
        constants.TECHNIQUE_CODE_CONFIG_PARSER,
        source,
    )

def detect_license_spdx(license_text):
    """
    Function that given a license text, infers the name and spdx id in a dockerfile

    Two passes are made over ``constants.LICENSES_DICT``: first the regular
    expression stored for every license is searched in the text and, if none
    matches, the text is searched for a literal SPDX identifier as a whole word.
    Both searches ignore case.

    Parameters
    ----------
    license_text
        Text of the license label. Empty or None values are accepted.

    Returns
    -------
    A JSON dictionary with "name", "spdx_id" and "@id" (URL of the license in
    spdx.org), or None when the text is empty or no license is recognised.
    """
    # Nothing to analyse; also avoids a TypeError in re.search when None is given.
    if not license_text:
        return None

    logging.debug("Detecting license from text: %s", license_text)

    # First pass: the regular expression declared for each known license.
    for license_name, license_info in constants.LICENSES_DICT.items():
        if re.search(license_info["regex"], license_text, re.IGNORECASE):
            spdx_id = license_info["spdx_id"]
            return {
                "name": license_name,
                "spdx_id": f"{spdx_id}",
                "@id": f"https://spdx.org/licenses/{spdx_id}"
            }

    # Second pass: the bare SPDX identifier written as a whole word.
    for license_name, license_info in constants.LICENSES_DICT.items():
        spdx_id = license_info["spdx_id"]
        if re.search(rf'\b{re.escape(spdx_id)}\b', license_text, re.IGNORECASE):
            return {
                "name": license_name,
                "spdx_id": spdx_id,
                "@id": f"https://spdx.org/licenses/{spdx_id}"
            }
    return None

return unique_maintainers

31 changes: 13 additions & 18 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from .parser.description_parser import parse_description_file
from .parser.toml_parser import parse_toml_file
from .parser.cabal_parser import parse_cabal_file
from .parser.dockerfile_parser import extract_dockerfile_maintainer
from .parser.dockerfile_parser import parse_dockerfile
from chardet import detect


Expand Down Expand Up @@ -76,20 +76,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
if filename == "Dockerfile" or filename.lower() == "docker-compose.yml":
docker_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
repo_relative_path, filename)
if filename == "Dockerfile":
format_file = constants.FORMAT_DOCKERFILE
maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
else:
format_file = constants.FORMAT_DOCKER_COMPOSE
maintainers = None

# full_path = os.path.join(repo_dir, file_path)

result_value = {
constants.PROP_VALUE: docker_url,
constants.PROP_TYPE: constants.URL,
constants.PROP_FORMAT: format_file
}
if maintainers:
result_value[constants.PROP_AUTHOR] = maintainers

if filename == "Dockerfile":
format_file = constants.FORMAT_DOCKERFILE
result_value[constants.PROP_FORMAT] = format_file
metadata_result = parse_dockerfile(os.path.join(dir_path, filename), metadata_result, docker_url)
else:
format_file = constants.FORMAT_DOCKER_COMPOSE

result_value[constants.PROP_FORMAT] = format_file

metadata_result.add_result(
constants.CAT_HAS_BUILD_FILE,
Expand All @@ -98,14 +100,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
constants.TECHNIQUE_FILE_EXPLORATION,
docker_url
)
# metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
# {
# constants.PROP_VALUE: docker_url,
# constants.PROP_TYPE: constants.URL,
# constants.PROP_FORMAT: format_file
# },
# 1,
# constants.TECHNIQUE_FILE_EXPLORATION, docker_url)

if filename.lower().endswith(".ipynb"):
notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
repo_relative_path, filename)
Expand Down
Loading