Skip to content

Commit 9bb07e5

Browse files
Praneel7015, agriyakhetarpal, idlip
authored
feat: add markdown to JSON parser enhancements (label normalization + schema validation) (#195)
* Add JSON schema for maintainer validation * feat: add JSON parser Enhancements * Update maintainer.schema.json Co-authored-by: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> * Fix casing for 'Bluesky' in parse-maintainer.py * Update validate_maintainers.py Co-authored-by: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> * Update validate_maintainers.py Co-authored-by: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> * Update validate_maintainers.py Co-authored-by: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> * Delete validate_maintainers.py --------- Co-authored-by: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> Co-authored-by: Dilip <117019901+idlip@users.noreply.github.com>
1 parent 53785e5 commit 9bb07e5

File tree

3 files changed

+262
-3
lines changed

3 files changed

+262
-3
lines changed

.pre-commit-config.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# See https://pre-commit.com for more information
2+
# Install: pip install pre-commit && pre-commit install
3+
repos:
4+
- repo: https://github.com/python-jsonschema/check-jsonschema
5+
rev: 0.29.4
6+
hooks:
7+
- id: check-jsonschema
8+
name: Validate maintainer JSON files
9+
files: ^content/maintainers/.*\.json$
10+
args: ["--schemafile", "maintainer.schema.json"]

maintainer.schema.json

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
{
2+
"$schema": "http://json-schema.org/draft-07/schema#",
3+
"$id": "https://forklore.org/maintainer.schema.json",
4+
"title": "Maintainer",
5+
"description": "Schema for validating maintainer JSON files",
6+
"type": "object",
7+
"required": ["username", "full_name", "photo", "socials", "projects", "form"],
8+
"additionalProperties": false,
9+
"properties": {
10+
"username": {
11+
"type": "string",
12+
"description": "Unique identifier for the maintainer",
13+
"minLength": 1
14+
},
15+
"full_name": {
16+
"type": "string",
17+
"description": "Full name of the maintainer",
18+
"minLength": 1
19+
},
20+
"photo": {
21+
"type": "string",
22+
"description": "URL to the maintainer's photo",
23+
"format": "uri"
24+
},
25+
"designation": {
26+
"type": "string",
27+
"description": "Job title or role description"
28+
},
29+
"socials": {
30+
"type": "array",
31+
"description": "List of social media links",
32+
"items": {
33+
"type": "object",
34+
"required": ["label", "link"],
35+
"additionalProperties": false,
36+
"properties": {
37+
"label": {
38+
"type": "string",
39+
"description": "Social media platform name",
40+
"enum": [
41+
"GitHub",
42+
"GitLab",
43+
"Gitlab",
44+
"Codeberg",
45+
"BitBucket",
46+
"LinkedIn",
47+
"X",
48+
"Twitter",
49+
"Mastodon",
50+
"Bluesky",
51+
"Substack",
52+
"Discourse",
53+
"Email",
54+
"RSS",
55+
"Web"
56+
]
57+
},
58+
"link": {
59+
"type": "string",
60+
"description": "URL to the social media profile",
61+
"format": "uri"
62+
}
63+
}
64+
}
65+
},
66+
"projects": {
67+
"type": "array",
68+
"description": "List of projects maintained",
69+
"items": {
70+
"type": "object",
71+
"required": ["name", "project_link", "description", "short_description"],
72+
"additionalProperties": false,
73+
"properties": {
74+
"name": {
75+
"type": "string",
76+
"description": "Name of the project",
77+
"minLength": 1
78+
},
79+
"project_link": {
80+
"type": "string",
81+
"description": "URL to the project repository",
82+
"format": "uri"
83+
},
84+
"website_link": {
85+
"type": "string",
86+
"description": "URL to the project website",
87+
"format": "uri"
88+
},
89+
"logo": {
90+
"type": "string",
91+
"description": "URL to the project logo",
92+
"format": "uri"
93+
},
94+
"description": {
95+
"type": "string",
96+
"description": "Full description of the project"
97+
},
98+
"short_description": {
99+
"type": "string",
100+
"description": "Short description used in cards and meta"
101+
}
102+
}
103+
}
104+
},
105+
"form": {
106+
"type": "array",
107+
"description": "Q&A form responses",
108+
"items": {
109+
"type": "object",
110+
"required": ["question", "response"],
111+
"additionalProperties": false,
112+
"properties": {
113+
"question": {
114+
"type": "string",
115+
"description": "The question text"
116+
},
117+
"response": {
118+
"type": "string",
119+
"description": "The maintainer's response"
120+
}
121+
}
122+
}
123+
},
124+
"created_on": {
125+
"type": "string",
126+
"description": "ISO 8601 timestamp of when the entry was created",
127+
"format": "date-time"
128+
}
129+
}
130+
}

parse-maintainer.py

Lines changed: 122 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,93 @@
44
import json
55
import re
66
from datetime import datetime
7+
from pathlib import Path
8+
9+
try:
10+
import jsonschema
11+
HAS_JSONSCHEMA = True
12+
except ImportError:
13+
HAS_JSONSCHEMA = False
14+
15+
16+
# Mapping of user input labels to schema-valid labels
17+
LABEL_NORMALIZATION = {
18+
# Lowercase variants
19+
"github": "GitHub",
20+
"gitlab": "GitLab",
21+
"codeberg": "Codeberg",
22+
"bitbucket": "BitBucket",
23+
"linkedin": "LinkedIn",
24+
"mastodon": "Mastodon",
25+
"bluesky": "Bluesky",
26+
"substack": "Substack",
27+
"discourse": "Discourse",
28+
"twitter": "Twitter",
29+
"email": "Email",
30+
"rss": "RSS",
31+
"web": "Web",
32+
"x": "X",
33+
# Mixed case variants
34+
"Github": "GitHub",
35+
"Gitlab": "GitLab",
36+
"Linkedin": "LinkedIn",
37+
"Bluesky": "Bluesky",
38+
"Bitbucket": "BitBucket",
39+
# Common aliases
40+
"Website": "Web",
41+
"website": "Web",
42+
"Blog": "Web",
43+
"blog": "Web",
44+
"Mail": "Email",
45+
"mail": "Email",
46+
"X/Twitter": "X",
47+
"Twitter/X": "X",
48+
}
49+
50+
# Valid labels as per schema
51+
VALID_LABELS = {
52+
"GitHub", "GitLab", "Gitlab", "Codeberg", "BitBucket", "LinkedIn",
53+
"X", "Twitter", "Mastodon", "Bluesky", "Substack",
54+
"Discourse", "Email", "RSS", "Web"
55+
}
56+
57+
58+
def normalize_label(label: str) -> str:
59+
"""Normalize a social media label to match schema requirements."""
60+
label = label.strip()
61+
62+
# Check if already valid
63+
if label in VALID_LABELS:
64+
return label
65+
66+
# Try normalization map
67+
if label in LABEL_NORMALIZATION:
68+
return LABEL_NORMALIZATION[label]
69+
70+
# Return as-is (will fail validation, but user will see error)
71+
return label
72+
73+
74+
def load_schema():
    """Read maintainer.schema.json from the directory containing this script.

    Returns:
        The parsed schema, or None when the schema file does not exist.
    """
    schema_file = Path(__file__).with_name("maintainer.schema.json")
    if schema_file.exists():
        return json.loads(schema_file.read_text(encoding="utf-8"))
    return None
81+
82+
83+
def validate_data(data: dict, schema: dict) -> list[str]:
    """Check *data* against *schema* and describe every violation found.

    Validation uses the Draft 7 validator with format checking enabled. When
    the optional ``jsonschema`` package could not be imported
    (HAS_JSONSCHEMA is False) nothing is checked and an empty list is
    returned.

    Args:
        data: The parsed maintainer record to validate.
        schema: The JSON schema to validate against.

    Returns:
        One message per validation error, formatted as " <path>: <message>",
        where <path> is the dotted location of the offending value or
        "(root)" for errors on the top-level object.
    """
    if not HAS_JSONSCHEMA:
        return []

    def describe(problem) -> str:
        # absolute_path is empty for errors on the document root.
        location = problem.absolute_path
        if location:
            where = ".".join(map(str, location))
        else:
            where = "(root)"
        return f" {where}: {problem.message}"

    checker = jsonschema.Draft7Validator(
        schema, format_checker=jsonschema.FormatChecker()
    )
    return [describe(problem) for problem in checker.iter_errors(data)]
794

895
def parse_issue(md):
996
# Remove HTML comments
@@ -34,8 +121,12 @@ def parse_issue(md):
34121
if ':' in line:
35122
line = line.lstrip('- ').strip()
36123
label, link = line.split(':', 1)
124+
# Normalize the label to match schema requirements
125+
normalized_label = normalize_label(label.strip())
126+
if normalized_label not in VALID_LABELS:
127+
print(f"Warning: Unknown or invalid social label '{label.strip()}' (normalized: '{normalized_label}')", file=sys.stderr)
37128
data['socials'].append({
38-
"label": label.strip(),
129+
"label": normalized_label,
39130
"link": link.strip()
40131
})
41132

@@ -89,10 +180,19 @@ def parse_issue(md):
89180

90181
if __name__ == "__main__":
91182
if len(sys.argv) < 2:
92-
print("Usage: python parse_maintainer.py <input_file.md>")
183+
print("Usage: python parse_maintainer.py <input_file.md> [--validate]")
184+
print("Options:")
185+
print(" --validate Validate output against JSON schema")
93186
sys.exit(1)
94187

95-
with open(sys.argv[1], 'r', encoding='utf-8') as f:
188+
args = sys.argv[1:]
189+
validate_mode = "--validate" in args
190+
if validate_mode:
191+
args.remove("--validate")
192+
193+
input_file = args[0]
194+
195+
with open(input_file, 'r', encoding='utf-8') as f:
96196
result = parse_issue(f.read())
97197

98198
# Output JSON
@@ -104,5 +204,24 @@ def parse_issue(md):
104204
output_file = f"{username}.json"
105205
with open(output_file, 'w', encoding='utf-8') as f:
106206
f.write(json_output)
207+
f.write("\n")
107208

108209
print(f"\nSaved to {output_file}", file=sys.stderr)
210+
211+
# Validate against schema if requested
212+
if validate_mode:
213+
schema = load_schema()
214+
if schema is None:
215+
print("Schema file not found, skipping validation", file=sys.stderr)
216+
elif not HAS_JSONSCHEMA:
217+
print("jsonschema not installed, skipping validation", file=sys.stderr)
218+
print("Install with: pip install jsonschema", file=sys.stderr)
219+
else:
220+
errors = validate_data(result, schema)
221+
if errors:
222+
print(f"\nValidation failed:", file=sys.stderr)
223+
for error in errors:
224+
print(error, file=sys.stderr)
225+
sys.exit(1)
226+
else:
227+
print("Validation passed", file=sys.stderr)

0 commit comments

Comments
 (0)