From d239634af9bdda7629d98f68b3231c759682888a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 14 Apr 2025 12:28:48 +0300 Subject: [PATCH] Fix bug with unclosed bold and italics in succession Fixes wiktextract issue #1120 and others What was broken: if you had text like `aaa '''bolded''` with a typo that makes the formatters unbalanced, the earlier token would eat the rest of the article. The issue was fixed by adding a `continue` to text_fn() in parser.py, in the `while True` block that loops over the current parser stack in reverse and pops its items, to handle end-of-line breakpoints for those items. The `elif` entry for italics and bold was missing a `continue`, so the `break` after the `if` block would execute, meaning the bold token was never popped and parsing continued with everything following being its children. --- src/wikitextprocessor/parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index a297331c..5703b957 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -967,6 +967,7 @@ def text_fn(ctx: "Wtp", token: str) -> None: sortid="parser/449", ) _parser_pop(ctx, False) + continue break # Spaces at the beginning of a line indicate preformatted text @@ -2241,9 +2242,9 @@ def token_iter(ctx: "Wtp", text: str) -> Iterator[tuple[bool, str]]: # the length is longer than the end token was. yield True, ">" + start continue - # Partition on '', so that we can detect bold/italics + # Partition on ''+, so that we can detect bold/italics parts = re.split(parts_re, line) - state = 0 # 1=in italic 2=in bold 3=in both + state = 0 # 1=in italic, 2=in bold, 3=in both for i, part in enumerate(parts): if part.startswith("''"): # This is a bold/italic part. Scan the rest of the line