47 changes: 47 additions & 0 deletions src/somef/parser/dockerfile_parser.py
@@ -0,0 +1,47 @@
import logging
import re
from ..utils import constants

def extract_dockerfile_maintainer(file_path):
    """Extract maintainer/author information from a Dockerfile.

    Checks the OCI org.opencontainers.image.authors label, a free-form
    maintainer label, and the deprecated MAINTAINER instruction, and returns
    a deduplicated list of the values found.
    """
    logging.info(f"Extracting maintainers from Dockerfile: {file_path}")
maintainers = []
unique_maintainers = []
try:
with open(file_path, "rb") as file:
raw_data = file.read()

try:
content = raw_data.decode("utf-8")
except UnicodeDecodeError:
logging.warning(f"File {file_path} is not UTF-8 decodable. Skipping.")
return maintainers

        # Unclear whether this should map to the existing author property
        # or to a new maintainer property
oci_match = re.findall(
constants.REGEXP_MAINTAINER_LABEL_OCI,
content,
re.IGNORECASE | re.MULTILINE
)
        # Free-form LABEL maintainer
label_match = re.findall(
constants.REGEXP_MAINTAINER_LABEL_FREE,
content,
re.IGNORECASE | re.MULTILINE
)
        # Deprecated MAINTAINER instruction
maintainer_match = re.findall(
constants.REGEXP_MAINTAINER,
content,
re.IGNORECASE | re.MULTILINE
)

maintainers.extend(oci_match)
maintainers.extend(label_match)
maintainers.extend(maintainer_match)

unique_maintainers = list({m.strip() for m in maintainers if m.strip()})
except OSError:
logging.warning(f"Could not read Dockerfile {file_path}")

return unique_maintainers
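
A minimal usage sketch of the new parser, assuming somef is installed so the module is importable; the temporary Dockerfile and its maintainer value are hypothetical and only exercise the free-form label handled above:

import os
import tempfile

from somef.parser.dockerfile_parser import extract_dockerfile_maintainer

# Hypothetical Dockerfile content with a free-form maintainer label.
dockerfile_text = 'LABEL maintainer="Jane Doe <[email protected]>"\n'

# Write it to a throwaway file and run the extractor on it.
with tempfile.NamedTemporaryFile("w", suffix="Dockerfile", delete=False) as tmp:
    tmp.write(dockerfile_text)

print(extract_dockerfile_maintainer(tmp.name))
# Expected, given the regexes in constants.py: ['Jane Doe <[email protected]>']
os.remove(tmp.name)
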
82 changes: 27 additions & 55 deletions src/somef/process_files.py
@@ -21,6 +21,7 @@
from .parser.description_parser import parse_description_file
from .parser.toml_parser import parse_toml_file
from .parser.cabal_parser import parse_cabal_file
from .parser.dockerfile_parser import extract_dockerfile_maintainer
from chardet import detect


@@ -77,16 +78,34 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
repo_relative_path, filename)
if filename == "Dockerfile":
format_file = constants.FORMAT_DOCKERFILE
maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
else:
format_file = constants.FORMAT_DOCKER_COMPOSE
metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
{
constants.PROP_VALUE: docker_url,
constants.PROP_TYPE: constants.URL,
constants.PROP_FORMAT: format_file
},
1,
constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
maintainers = None

result_value = {
constants.PROP_VALUE: docker_url,
constants.PROP_TYPE: constants.URL,
constants.PROP_FORMAT: format_file
}
if maintainers:
result_value[constants.PROP_AUTHOR] = maintainers

metadata_result.add_result(
constants.CAT_HAS_BUILD_FILE,
result_value,
1,
constants.TECHNIQUE_FILE_EXPLORATION,
docker_url
)
if filename.lower().endswith(".ipynb"):
notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
repo_relative_path, filename)
@@ -652,50 +671,3 @@ def clean_text(text):
cleaned_lines.append(line)
return "\n".join(cleaned_lines)

# """
# Process a text with possible authors
# """
# if not author_str:
# return []

# authors = []

# for line in author_str.splitlines():
# line = line.strip()
# if not line or line.startswith("#"):
# continue

# email_match = re.search(r'<([^>]+)>', line)
# if email_match:
# email = email_match.group(1)
# name = line[:email_match.start()].strip()
# else:
# name = line
# email = None

# if name:
# if re.search(constants.REGEXP_LTD_INC, name, re.IGNORECASE):
# type_author = "Organization"
# author_info = {
# "name": name,
# "email": email,
# "value": name,
# "type": type_author
# }
# else:
# type_author = "Person"
# name_parts = name.split()
# given_name = name_parts[0] if name_parts else None
# last_name = " ".join(name_parts[1:]) if len(name_parts) > 1 else None
# author_info = {
# "name": name,
# "email": email,
# "value": name,
# "type": type_author,
# "given_name": given_name,
# "last_name": last_name
# }

# authors.append(author_info)

# return authors
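
For orientation, a hedged sketch of the shape of a has_build_file entry once the Dockerfile maintainers are attached; only the result/author nesting and the author value are confirmed by the tests below, while the remaining keys and literal values are illustrative assumptions rather than verified somef output:

# Hypothetical has_build_file entry (Python literal) for a Dockerfile whose
# OCI authors label is "FairwindsOps, Inc.".
has_build_file_entry = {
    "result": {
        "value": "<docker_url placeholder>",  # link to the Dockerfile in the repository
        "type": "Url",                        # assumed value of constants.URL
        "format": "Dockerfile",               # assumed value of constants.FORMAT_DOCKERFILE
        "author": ["FairwindsOps, Inc."],     # output of extract_dockerfile_maintainer
    }
}
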
80 changes: 80 additions & 0 deletions src/somef/test/test_JSON_export.py
@@ -470,5 +470,85 @@ def test_issue_859(self):
os.remove(test_data_path + "test-859.json")


def test_issue_725(self):
"""Checks if this repository has authors extracted from Dockerfile"""

somef_cli.run_cli(threshold=0.8,
ignore_classifiers=False,
repo_url=None,
local_repo=test_data_repositories + "Fairwinds",
doc_src=None,
in_file=None,
output=test_data_path + "test_issue_725.json",
graph_out=None,
graph_format="turtle",
codemeta_out=None,
pretty=True,
missing=False,
readme_only=False)

        with open(test_data_path + "test_issue_725.json", "r") as text_file:
            data = text_file.read()
        json_content = json.loads(data)

has_built = json_content.get("has_build_file", [])

authors = []
for entry in has_built:
result = entry.get("result", {})
if "author" in result:
authors.extend(result["author"])

expected_author = "FairwindsOps, Inc."

assert expected_author in authors, (
f"Expected author '{expected_author}' not found. "
f"Authors found: {authors}"
)
os.remove(test_data_path + "test_issue_725.json")

def test_issue_725_2(self):
"""Checks if this repository has authors extracted from Dockerfile"""

somef_cli.run_cli(threshold=0.8,
ignore_classifiers=False,
repo_url=None,
local_repo=test_data_repositories + "Prometeus",
doc_src=None,
in_file=None,
output=test_data_path + "test_issue_725_2.json",
graph_out=None,
graph_format="turtle",
codemeta_out=None,
pretty=True,
missing=False,
readme_only=False)

        with open(test_data_path + "test_issue_725_2.json", "r") as text_file:
            data = text_file.read()
        json_content = json.loads(data)

has_built = json_content.get("has_build_file", [])

authors = []
for entry in has_built:
result = entry.get("result", {})
if "author" in result:
authors.extend(result["author"])

expected_author = "The Prometheus Authors"

assert expected_author in authors, (
f"Expected author '{expected_author}' not found. "
f"Authors found: {authors}"
)
expected_count = 2
assert len(authors) == expected_count, (
f"Expected {expected_count} authors, but found {len(authors)}: {authors}"
)
os.remove(test_data_path + "test_issue_725_2.json")

if __name__ == '__main__':
unittest.main()
14 changes: 14 additions & 0 deletions src/somef/test/test_data/repositories/Fairwinds/Dockerfile
@@ -0,0 +1,14 @@
FROM alpine:3.23

LABEL org.opencontainers.image.authors="FairwindsOps, Inc." \
org.opencontainers.image.vendor="FairwindsOps, Inc." \
org.opencontainers.image.title="Nova" \
org.opencontainers.image.description="Nova is a cli tool to find outdated or deprecated Helm charts running in your Kubernetes cluster." \
org.opencontainers.image.documentation="https://nova.docs.fairwinds.com/" \
org.opencontainers.image.source="https://github.com/FairwindsOps/nova" \
org.opencontainers.image.url="https://github.com/FairwindsOps/nova" \
org.opencontainers.image.licenses="Apache License 2.0"

USER nobody
COPY nova /
CMD ["/nova"]
31 changes: 31 additions & 0 deletions src/somef/test/test_data/repositories/Prometeus/Dockerfile
@@ -0,0 +1,31 @@
ARG ARCH="amd64"
ARG OS="linux"
FROM quay.io/prometheus/busybox-${OS}-${ARCH}:latest
LABEL maintainer="The Prometheus Authors <[email protected]>"
LABEL org.opencontainers.image.authors="The Prometheus Authors" \
org.opencontainers.image.vendor="Prometheus" \
org.opencontainers.image.title="Prometheus" \
org.opencontainers.image.description="The Prometheus monitoring system and time series database" \
org.opencontainers.image.source="https://github.com/prometheus/prometheus" \
org.opencontainers.image.url="https://github.com/prometheus/prometheus" \
org.opencontainers.image.documentation="https://prometheus.io/docs" \
org.opencontainers.image.licenses="Apache License 2.0"

ARG ARCH="amd64"
ARG OS="linux"
COPY .build/${OS}-${ARCH}/prometheus /bin/prometheus
COPY .build/${OS}-${ARCH}/promtool /bin/promtool
COPY documentation/examples/prometheus.yml /etc/prometheus/prometheus.yml
COPY LICENSE /LICENSE
COPY NOTICE /NOTICE
COPY npm_licenses.tar.bz2 /npm_licenses.tar.bz2

WORKDIR /prometheus
RUN chown -R nobody:nobody /etc/prometheus /prometheus && chmod g+w /prometheus

USER nobody
EXPOSE 9090
VOLUME [ "/prometheus" ]
ENTRYPOINT [ "/bin/prometheus" ]
CMD [ "--config.file=/etc/prometheus/prometheus.yml", \
"--storage.tsdb.path=/prometheus" ]
7 changes: 7 additions & 0 deletions src/somef/utils/constants.py
@@ -454,3 +454,10 @@ class RepositoryType(Enum):
CAT_CODEMETA_SOFTWAREREQUIREMENTS = "softwareRequirements"
CAT_CODEMETA_SOFTWAREVERSION = "softwareVersion"
CAT_CODEMETA_URL = "url"


# Dockerfile maintainer/author label regular expressions
# REGEXP_MAINTAINER_LABEL_OCI = r'^\s*LABEL\s+org\.opencontainers\.image\.authors\s*=\s*["\']?(.+?)["\']?\s*$'
REGEXP_MAINTAINER_LABEL_OCI = r'^\s*LABEL\s+org\.opencontainers\.image\.authors\s*=\s*["\']([^"\'\\]+)["\']?\s*(?:\\)?\s*$'
REGEXP_MAINTAINER_LABEL_FREE = r'^\s*LABEL\s+"?maintainer"?\s*=\s*["\']?(.+?)["\']?\s*$'
REGEXP_MAINTAINER = r'^\s*MAINTAINER\s+(.+)$'
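
A quick, hedged check of the three patterns against sample Dockerfile lines (names and e-mail addresses are made up), using the same flags as dockerfile_parser.py and assuming somef is installed so the constants module is importable:

import re

from somef.utils import constants

# Hypothetical Dockerfile snippet exercising each supported form.
sample = (
    'LABEL org.opencontainers.image.authors="Example Org, Inc." \\\n'
    '      org.opencontainers.image.vendor="Example Org, Inc."\n'
    'LABEL maintainer="Jane Doe <[email protected]>"\n'
    'MAINTAINER John Doe <[email protected]>\n'
)

flags = re.IGNORECASE | re.MULTILINE
print(re.findall(constants.REGEXP_MAINTAINER_LABEL_OCI, sample, flags))   # ['Example Org, Inc.']
print(re.findall(constants.REGEXP_MAINTAINER_LABEL_FREE, sample, flags))  # ['Jane Doe <[email protected]>']
print(re.findall(constants.REGEXP_MAINTAINER, sample, flags))             # ['John Doe <[email protected]>']
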