|
2 | 2 | from concurrent.futures import ThreadPoolExecutor |
3 | 3 | from typing import Dict, List, Optional |
4 | 4 | from app.core.interfaces import WikipediaClientInterface, CacheServiceInterface |
5 | | -from app.utils.exceptions import WikipediaAPIError, WikipediaPageNotFoundError |
| 5 | +from app.utils.exceptions import WikipediaAPIError |
6 | 6 | from app.utils.logging import get_logger |
7 | 7 |
|
8 | 8 | logger = get_logger(__name__) |
@@ -197,6 +197,81 @@ def page_exists(self, page_title: str) -> bool: |
197 | 197 | logger.error(f"Failed to check page existence for {page_title}: {e}") |
198 | 198 | return False |
199 | 199 |
|
def get_page_with_redirect_info(self, page_title: str) -> Optional[dict]:
    """
    Get page information including redirect details.

    Issues a single MediaWiki ``action=query`` request with redirect
    resolution enabled and summarises the outcome.

    Args:
        page_title: Wikipedia page title

    Returns:
        Dict with 'exists', 'final_title', 'was_redirected',
        'is_disambiguation' and 'original_title'. When the request fails or
        the response body is not valid JSON, the error is logged and a dict
        describing a non-existent, non-redirected page is returned.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": page_title,
        # pageprops carries the authoritative "disambiguation" flag; the
        # category scan below is kept as a fallback.
        "prop": "info|categories|pageprops",
        "ppprop": "disambiguation",
        # The category list is paginated (small default page size), so ask
        # for as many as allowed to avoid missing the disambiguation one.
        "cllimit": "max",
        "redirects": 1,
    }

    try:
        response = self.session.get(self.base_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json().get("query", {})
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"Failed to get page redirect info for {page_title}: {e}")
        return {
            "exists": False,
            "final_title": page_title,
            "was_redirected": False,
            "is_disambiguation": False,
            "original_title": page_title,
        }

    # Check for redirects
    redirects = data.get("redirects", [])
    was_redirected = bool(redirects)
    final_title = page_title

    if was_redirected:
        # The API reports redirects from the *normalized* title, so map the
        # caller's spelling through "normalized" before looking it up.
        normalized = {
            entry.get("from"): entry.get("to")
            for entry in data.get("normalized", [])
        }
        hops = {
            entry.get("from"): entry.get("to")
            for entry in redirects
            if entry.get("to")
        }
        current = normalized.get(page_title) or page_title
        visited = set()
        # Follow the chain to its end; `visited` guards against loops.
        while current in hops and current not in visited:
            visited.add(current)
            current = hops[current]
        if visited:
            final_title = current

    # Check if page exists
    exists = False
    is_disambiguation = False

    for page_data in data.get("pages", {}).values():
        # Entries flagged "invalid" (malformed titles) have no "missing"
        # key but do not correspond to a real page either.
        if "missing" in page_data or "invalid" in page_data:
            continue
        exists = True

        current_title = page_data.get("title", "").lower()
        if (
            "(disambiguation)" in current_title
            or "disambiguation" in page_data.get("pageprops", {})
            or any(
                "disambiguation" in category.get("title", "").lower()
                for category in page_data.get("categories", [])
            )
        ):
            is_disambiguation = True

    return {
        "exists": exists,
        "final_title": final_title,
        "was_redirected": was_redirected,
        "is_disambiguation": is_disambiguation,
        "original_title": page_title,
    }
| 274 | + |
200 | 275 | def get_page_info(self, page_title: str) -> Optional[dict]: |
201 | 276 | """ |
202 | 277 | Get basic information about a Wikipedia page. |
|
0 commit comments