Skip to content

Commit 45dd347

Browse files
authored
Merge pull request #124 from allenhutchison/fix-iso8601-dates
Add support for ISO 8601 dates in OPML files
2 parents e2eaf2a + 971d511 commit 45dd347

File tree

6 files changed

+213
-4
lines changed

6 files changed

+213
-4
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Changed
2+
-------
3+
4+
* Try to parse OPML timestamps using RFC 3339 format
5+
if the timestamp isn't in RFC 822 format. (#123)
6+
7+
Thanks to Allen Hutchison for this improvement!

src/listparser/dates.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from __future__ import annotations
77

88
import datetime
9+
import re
910

1011
months: dict[str, int] = {
1112
"jan": 1,
@@ -171,3 +172,66 @@ def parse_rfc822(date: str) -> datetime.datetime | None:
171172
)
172173
except (ValueError, OverflowError):
173174
return None
175+
176+
177+
_rfc3339_pattern = re.compile(
178+
r"""
179+
^
180+
(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})
181+
[T ]
182+
(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})
183+
(?:\.(?P<microsecond>\d+))?
184+
(?P<timezone>Z|[+-]\d{2}:?\d{2})?
185+
$
186+
""",
187+
flags=re.VERBOSE | re.ASCII,
188+
)
189+
190+
191+
def parse_rfc3339(date: str) -> datetime.datetime | None:
192+
"""Parse RFC 3339 dates and times.
193+
194+
`datetime.datetime.fromisoformat()` can be used
195+
once Python 3.10 and lower are no longer supported.
196+
"""
197+
198+
match = _rfc3339_pattern.match(date)
199+
if not match:
200+
return None
201+
202+
year = int(match.group("year"))
203+
month = int(match.group("month"))
204+
day = int(match.group("day"))
205+
hour = int(match.group("hour"))
206+
minute = int(match.group("minute"))
207+
second = int(match.group("second"))
208+
209+
# Handle microseconds (if present).
210+
microsecond = 0
211+
if fractional := match.group("microsecond"):
212+
# Pad or truncate to 6 digits for microseconds.
213+
fractional = fractional.ljust(6, "0")[:6]
214+
microsecond = int(fractional)
215+
216+
tzinfo = None
217+
if timezone := match.group("timezone"):
218+
if timezone == "Z":
219+
tzinfo = datetime.timezone.utc
220+
else:
221+
# Handle +HH:MM and -HH:MM format.
222+
sign = 1 if timezone[0] == "+" else -1
223+
tz_hour = int(timezone[1:3])
224+
tz_minute = int(timezone[-2:])
225+
offset = datetime.timedelta(minutes=sign * ((tz_hour * 60) + tz_minute))
226+
try:
227+
tzinfo = datetime.timezone(offset)
228+
except ValueError:
229+
return None
230+
231+
# Create datetime object
232+
try:
233+
return datetime.datetime(
234+
year, month, day, hour, minute, second, microsecond, tzinfo=tzinfo
235+
)
236+
except (ValueError, OverflowError):
237+
return None

src/listparser/opml.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,20 +115,20 @@ def end_opml_datecreated(self) -> None:
115115
value = self.get_text()
116116
if value:
117117
self.harvest["meta"]["created"] = value
118-
timestamp = dates.parse_rfc822(value)
118+
timestamp = dates.parse_rfc822(value) or dates.parse_rfc3339(value)
119119
if timestamp:
120120
self.harvest["meta"]["created_parsed"] = timestamp
121121
else:
122-
self.raise_bozo("dateCreated is not an RFC 822 datetime")
122+
self.raise_bozo("dateCreated is not a recognized datetime format")
123123

124124
start_opml_datemodified = common.Common.expect_text
125125

126126
def end_opml_datemodified(self) -> None:
127127
value = self.get_text()
128128
if value:
129129
self.harvest["meta"]["modified"] = value
130-
timestamp = dates.parse_rfc822(value)
130+
timestamp = dates.parse_rfc822(value) or dates.parse_rfc3339(value)
131131
if timestamp:
132132
self.harvest["meta"]["modified_parsed"] = timestamp
133133
else:
134-
self.raise_bozo("dateModified is not an RFC 822 datetime")
134+
self.raise_bozo("dateModified is not a recognized datetime format")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?xml version="1.0"?>
2+
<!--
3+
Description: /opml/head/dateCreated RFC3339
4+
Eval: result['bozo'] == 0
5+
Eval: result['meta']['created'] == '2025-01-02T03:04:05Z'
6+
Eval: result['meta']['created_parsed'] == datetime.datetime(2025, 1, 2, 3, 4, 5, tzinfo=datetime.timezone.utc)
7+
-->
8+
<opml version="2.0">
9+
<head>
10+
<dateCreated>2025-01-02T03:04:05Z</dateCreated>
11+
</head>
12+
<body>
13+
<outline text="node" />
14+
</body>
15+
</opml>
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?xml version="1.0"?>
2+
<!--
3+
Description: /opml/head/dateModified RFC3339
4+
Eval: result['bozo'] == 0
5+
Eval: result['meta']['modified'] == '2025-01-02T03:04:05Z'
6+
Eval: result['meta']['modified_parsed'] == datetime.datetime(2025, 1, 2, 3, 4, 5, tzinfo=datetime.timezone.utc)
7+
-->
8+
<opml version="2.0">
9+
<head>
10+
<dateModified>2025-01-02T03:04:05Z</dateModified>
11+
</head>
12+
<body>
13+
<outline text="node" />
14+
</body>
15+
</opml>

tests/test_dates.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,111 @@ def test_timezones(date, hour, minute, offset):
136136
)
137137
def test_invalid_dates(date):
138138
assert listparser.dates.parse_rfc822(date) is None
139+
140+
141+
# Valid RFC 3339 inputs, keyed by pytest test id.
# Each value is a ``(date string, expected datetime)`` pair.
_RFC3339_VALID_CASES = {
    "simple": (
        "2025-04-07T19:52:30Z",
        datetime.datetime(2025, 4, 7, 19, 52, 30, tzinfo=datetime.timezone.utc),
    ),
    "milliseconds": (
        "2025-04-07T19:52:30.941Z",
        datetime.datetime(
            2025, 4, 7, 19, 52, 30, 941000, tzinfo=datetime.timezone.utc
        ),
    ),
    # Nine fractional digits are supplied; only the first six are kept.
    "truncated milliseconds": (
        "2025-04-07T19:52:30.941378073Z",
        datetime.datetime(
            2025, 4, 7, 19, 52, 30, 941378, tzinfo=datetime.timezone.utc
        ),
    ),
    "timezone offset with colon": (
        "2025-04-07T19:52:30+02:00",
        datetime.datetime(
            2025, 4, 7, 19, 52, 30,
            tzinfo=datetime.timezone(datetime.timedelta(hours=2)),
        ),
    ),
    "timezone offset without colon": (
        "2025-04-07T19:52:30-0700",
        datetime.datetime(
            2025, 4, 7, 19, 52, 30,
            tzinfo=datetime.timezone(datetime.timedelta(hours=-7)),
        ),
    ),
    # No timezone in the input yields a naive datetime.
    "no timezone": (
        "2025-01-01T00:00:00",
        datetime.datetime(2025, 1, 1, 0, 0, 0),
    ),
    "space instead of 'T'": (
        "2025-04-07 19:52:30Z",
        datetime.datetime(2025, 4, 7, 19, 52, 30, tzinfo=datetime.timezone.utc),
    ),
    "earliest possible date": (
        "0001-01-01T00:00:00+23:59",
        datetime.datetime(
            1, 1, 1, 0, 0, 0,
            tzinfo=datetime.timezone(datetime.timedelta(hours=23, minutes=59)),
        ),
    ),
    "latest possible date": (
        "9999-12-31T23:59:59-23:59",
        datetime.datetime(
            9999, 12, 31, 23, 59, 59,
            tzinfo=datetime.timezone(-datetime.timedelta(hours=23, minutes=59)),
        ),
    ),
}


@pytest.mark.parametrize(
    "date, expected",
    list(_RFC3339_VALID_CASES.values()),
    ids=list(_RFC3339_VALID_CASES),
)
def test_rfc3339_dates(date: str, expected: datetime.datetime):
    """Each well-formed RFC 3339 string parses to the expected datetime."""
    parsed = listparser.dates.parse_rfc3339(date)
    assert parsed == expected
229+
230+
231+
# Inputs that must be rejected, keyed by pytest test id.
_RFC3339_INVALID_CASES = {
    "invalid year": "0000-01-01T01:01:01Z",
    "invalid month": "2025-99-01T01:01:01Z",
    "invalid day": "2025-01-99T01:01:01Z",
    "invalid hour": "2025-01-01T99:01:01Z",
    "invalid minute": "2025-01-01T01:99:01Z",
    "invalid seconds": "2025-01-01T01:01:99Z",
    "invalid timezone offset hours": "2025-01-01T01:01:01+99:00",
    "invalid separator": "2025-01-01TT01:01:01Z",
    "invalid format": "tomorrow-ish",
}


@pytest.mark.parametrize(
    "date",
    list(_RFC3339_INVALID_CASES.values()),
    ids=list(_RFC3339_INVALID_CASES),
)
def test_rfc3339_invalid_dates(date: str):
    """Malformed or out-of-range inputs make the parser return ``None``."""
    parsed = listparser.dates.parse_rfc3339(date)
    assert parsed is None

0 commit comments

Comments
 (0)