From d239634af9bdda7629d98f68b3231c759682888a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Mon, 14 Apr 2025 12:28:48 +0300
Subject: [PATCH] Fix bug with unclosed bold and italics in succession

Fixes wiktextract issue #1120 and others

What was broken: if you had text like `aaa '''bolded''`
with a typo that makes the formatters unbalanced,
the earlier token would eat the rest of the article.

The issue was fixed by adding a `continue` to text_fn() in
parser.py, in the `while True` block that loops over the
current parser stack in reverse and pops its items to
handle end-of-line breakpoints for those items.

The `elif` branch for italics and bold was missing a `continue`,
so the `break` after the `if` block would execute,
meaning the bold token was never popped and parsing
continued with everything that followed as its children.
---
 src/wikitextprocessor/parser.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py
index a297331c..5703b957 100644
--- a/src/wikitextprocessor/parser.py
+++ b/src/wikitextprocessor/parser.py
@@ -967,6 +967,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
                     sortid="parser/449",
                 )
                 _parser_pop(ctx, False)
+                continue
             break
 
         # Spaces at the beginning of a line indicate preformatted text
@@ -2241,9 +2242,9 @@ def token_iter(ctx: "Wtp", text: str) -> Iterator[tuple[bool, str]]:
                 # the length is longer than the end token was.
                 yield True, ">" + start
             continue
-        # Partition on '', so that we can detect bold/italics
+        # Partition on ''+, so that we can detect bold/italics
         parts = re.split(parts_re, line)
-        state = 0  # 1=in italic 2=in bold 3=in both
+        state = 0  # 1=in italic, 2=in bold, 3=in both
         for i, part in enumerate(parts):
             if part.startswith("''"):
                 # This is a bold/italic part.  Scan the rest of the line