diff --git a/addon/globalPlugins/visionAssistant/__init__.py b/addon/globalPlugins/visionAssistant/__init__.py index 089a7b4..702d21b 100644 --- a/addon/globalPlugins/visionAssistant/__init__.py +++ b/addon/globalPlugins/visionAssistant/__init__.py @@ -49,654 +49,36 @@ log = logging.getLogger(__name__) addonHandler.initTranslation() -_vision_assistant_instance = None - -ADDON_NAME = addonHandler.getCodeAddon().manifest["summary"] -GITHUB_REPO = "mahmoodhozhabri/VisionAssistantPro" - -# --- Constants & Config --- - -CHROME_OCR_KEYS = [ - "AIzaSyA2KlwBX3mkFo30om9LUFYQhpqLoa_BNhE", - "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" -] - -MODELS = [ - # --- 1. Recommended (Auto-Updating) --- - # Translators: AI Model info. [Auto] = Automatic updates. (Latest) = Newest version. - (_("[Auto]") + " Gemini Flash " + _("(Latest)"), "gemini-flash-latest"), - (_("[Auto]") + " Gemini Flash Lite " + _("(Latest)"), "gemini-flash-lite-latest"), - - # --- 2. Current Standard (Free & Fast) --- - # Translators: AI Model info. [Free] = Generous usage limits. (Preview) = Experimental or early-access version. - (_("[Free]") + " Gemini 3.0 Flash " + _("(Preview)"), "gemini-3-flash-preview"), - (_("[Free]") + " Gemini 2.5 Flash", "gemini-2.5-flash"), - (_("[Free]") + " Gemini 2.5 Flash Lite", "gemini-2.5-flash-lite"), - - # --- 3. High Intelligence (Paid/Pro/Preview) --- - # Translators: AI Model info. [Pro] = High intelligence/Paid tier. (Preview) = Experimental version. - (_("[Pro]") + " Gemini 3.0 Pro " + _("(Preview)"), "gemini-3-pro-preview"), - (_("[Pro]") + " Gemini 2.5 Pro", "gemini-2.5-pro"), -] - -GEMINI_VOICES = [ - # Translators: Adjective describing a bright AI voice style. - ("Zephyr", _("Bright")), - # Translators: Adjective describing an upbeat AI voice style. - ("Puck", _("Upbeat")), - # Translators: Adjective describing an informative AI voice style. - ("Charon", _("Informative")), - # Translators: Adjective describing a firm AI voice style. 
- ("Kore", _("Firm")), - # Translators: Adjective describing an excitable AI voice style. - ("Fenrir", _("Excitable")), - # Translators: Adjective describing a youthful AI voice style. - ("Leda", _("Youthful")), - # Translators: Adjective describing a firm AI voice style. - ("Orus", _("Firm")), - # Translators: Adjective describing a breezy AI voice style. - ("Aoede", _("Breezy")), - # Translators: Adjective describing an easy-going AI voice style. - ("Callirrhoe", _("Easy-going")), - # Translators: Adjective describing a bright AI voice style. - ("Autonoe", _("Bright")), - # Translators: Adjective describing a breathy AI voice style. - ("Enceladus", _("Breathy")), - # Translators: Adjective describing a clear AI voice style. - ("Iapetus", _("Clear")), - # Translators: Adjective describing an easy-going AI voice style. - ("Umbriel", _("Easy-going")), - # Translators: Adjective describing a smooth AI voice style. - ("Algieba", _("Smooth")), - # Translators: Adjective describing a smooth AI voice style. - ("Despina", _("Smooth")), - # Translators: Adjective describing a clear AI voice style. - ("Erinome", _("Clear")), - # Translators: Adjective describing a gravelly AI voice style. - ("Algenib", _("Gravelly")), - # Translators: Adjective describing an informative AI voice style. - ("Rasalgethi", _("Informative")), - # Translators: Adjective describing an upbeat AI voice style. - ("Laomedeia", _("Upbeat")), - # Translators: Adjective describing a soft AI voice style. - ("Achernar", _("Soft")), - # Translators: Adjective describing a firm AI voice style. - ("Alnilam", _("Firm")), - # Translators: Adjective describing an even AI voice style. - ("Schedar", _("Even")), - # Translators: Adjective describing a mature AI voice style. - ("Gacrux", _("Mature")), - # Translators: Adjective describing a forward AI voice style. - ("Pulcherrima", _("Forward")), - # Translators: Adjective describing a friendly AI voice style. 
- ("Achird", _("Friendly")), - # Translators: Adjective describing a casual AI voice style. - ("Zubenelgenubi", _("Casual")), - # Translators: Adjective describing a gentle AI voice style. - ("Vindemiatrix", _("Gentle")), - # Translators: Adjective describing a lively AI voice style. - ("Sadachbia", _("Lively")), - # Translators: Adjective describing a knowledgeable AI voice style. - ("Sadaltager", _("Knowledgeable")), - # Translators: Adjective describing a warm AI voice style. - ("Sulafat", _("Warm")) -] - -BASE_LANGUAGES = [ - ("Arabic", "ar"), ("Bulgarian", "bg"), ("Chinese", "zh"), ("Czech", "cs"), ("Danish", "da"), - ("Dutch", "nl"), ("English", "en"), ("Finnish", "fi"), ("French", "fr"), - ("German", "de"), ("Greek", "el"), ("Hebrew", "he"), ("Hindi", "hi"), - ("Hungarian", "hu"), ("Indonesian", "id"), ("Italian", "it"), ("Japanese", "ja"), - ("Korean", "ko"), ("Nepali", "ne"), ("Norwegian", "no"), ("Persian", "fa"), ("Polish", "pl"), - ("Portuguese", "pt"), ("Romanian", "ro"), ("Russian", "ru"), ("Spanish", "es"), - ("Swedish", "sv"), ("Thai", "th"), ("Turkish", "tr"), ("Ukrainian", "uk"), - ("Vietnamese", "vi") -] -SOURCE_LIST = [("Auto-detect", "auto")] + BASE_LANGUAGES -SOURCE_NAMES = [x[0] for x in SOURCE_LIST] -TARGET_LIST = BASE_LANGUAGES -TARGET_NAMES = [x[0] for x in TARGET_LIST] -TARGET_CODES = {x[0]: x[1] for x in BASE_LANGUAGES} - -OCR_ENGINES = [ - # Translators: OCR Engine option (Fast but less formatted) - (_("Chrome (Fast)"), "chrome"), - # Translators: OCR Engine option (Slower but better formatting) - (_("Gemini (Formatted)"), "gemini") -] - -confspec = { - "proxy_url": "string(default='')", - "api_key": "string(default='')", - "model_name": "string(default='gemini-flash-lite-latest')", - "target_language": "string(default='English')", - "source_language": "string(default='Auto-detect')", - "ai_response_language": "string(default='English')", - "smart_swap": "boolean(default=True)", - "captcha_mode": "string(default='navigator')", - 
"custom_prompts": "string(default='')", - "custom_prompts_v2": "string(default='')", - "default_refine_prompts": "string(default='')", - "check_update_startup": "boolean(default=False)", - "clean_markdown_chat": "boolean(default=True)", - "copy_to_clipboard": "boolean(default=False)", - "skip_chat_dialog": "boolean(default=False)", - "ocr_engine": "string(default='chrome')", - "tts_voice": "string(default='Puck')" -} - -config.conf.spec["VisionAssistant"] = confspec - -PROMPT_TRANSLATE = """ -Task: Translate the text below to "{target_lang}". - -Configuration: -- Target Language: "{target_lang}" -- Swap Language: "{swap_target}" -- Smart Swap: {smart_swap} - -Rules: -1. DEFAULT: Translate the input strictly to "{target_lang}". -2. MIXED CONTENT: If the text contains mixed languages (e.g., Arabic content with English UI terms like 'Reply', 'From', 'Forwarded'), translate EVERYTHING to "{target_lang}". -3. EXCEPTION: If (and ONLY if) the input is already completely in "{target_lang}" AND "Smart Swap" is True, then translate to "{swap_target}". - -Constraints: -- Output ONLY the translation. -- Do NOT translate actual programming code (Python, C++, etc.) or URLs. -- Translate ALL UI elements, menus, and interface labels. - -Input Text: -{text_content} -""" - -PROMPT_UI_LOCATOR = "Analyze UI (Size: {width}x{height}). Request: '{query}'. Output JSON: {{\"x\": int, \"y\": int, \"found\": bool}}." - -REFINE_PROMPT_KEYS = ("summarize", "fix_grammar", "fix_translate", "explain") - -LEGACY_REFINER_TOKENS = { - "summarize": "[summarize]", - "fix_grammar": "[fix_grammar]", - "fix_translate": "[fix_translate]", - "explain": "[explain]", -} - -DEFAULT_SYSTEM_PROMPTS = ( - { - "key": "summarize", - # Translators: Section header for text refinement prompts in Prompt Manager. - "section": _("Refine"), - # Translators: Label for the text summarization prompt. 
- "label": _("Summarize"), - "prompt": "Summarize the text below in {response_lang}.", - }, - { - "key": "fix_grammar", - # Translators: Section header for text refinement prompts in Prompt Manager. - "section": _("Refine"), - # Translators: Label for the grammar correction prompt. - "label": _("Fix Grammar"), - "prompt": "Fix grammar in the text below. Output ONLY the fixed text.", - }, - { - "key": "fix_translate", - # Translators: Section header for text refinement prompts in Prompt Manager. - "section": _("Refine"), - # Translators: Label for the grammar correction and translation prompt. - "label": _("Fix Grammar & Translate"), - "prompt": "Fix grammar and translate to {target_lang}.{swap_instruction} Output ONLY the result.", - }, - { - "key": "explain", - # Translators: Section header for text refinement prompts in Prompt Manager. - "section": _("Refine"), - # Translators: Label for the text explanation prompt. - "label": _("Explain"), - "prompt": "Explain the text below in {response_lang}.", - }, - { - "key": "translate_main", - # Translators: Section header for translation-related prompts in Prompt Manager. - "section": _("Translation"), - # Translators: Label for the smart translation prompt. - "label": _("Smart Translation"), - "prompt": PROMPT_TRANSLATE.strip(), - }, - { - "key": "translate_quick", - # Translators: Section header for translation-related prompts in Prompt Manager. - "section": _("Translation"), - # Translators: Label for the quick translation prompt. - "label": _("Quick Translation"), - "prompt": "Translate to {target_lang}. Output ONLY translation.", - }, - { - "key": "document_chat_system", - # Translators: Section header for document-related prompts in Prompt Manager. - "section": _("Document"), - # Translators: Label for the initial context prompt in document chat. - "label": _("Document Chat Context"), - "prompt": "STRICTLY Respond in {response_lang}. Use Markdown formatting. 
Analyze the attached content to answer.", - }, - { - "key": "document_chat_ack", - # Translators: Section header for advanced/internal prompts in Prompt Manager. - "section": _("Advanced"), - # Translators: Label for the AI's acknowledgement reply in document chat. - "label": _("Document Chat Bootstrap Reply"), - "internal": True, - "prompt": "Context received. Ready for questions.", - }, - { - "key": "vision_navigator_object", - # Translators: Section header for image analysis prompts in Prompt Manager. - "section": _("Vision"), - # Translators: Label for the prompt used to analyze the current navigator object. - "label": _("Navigator Object Analysis"), - "prompt": ( - "Analyze this image. Describe the layout, visible text, and UI elements. " - "Use Markdown formatting (headings, lists) to organize the description. " - "Language: {response_lang}. Ensure the response is strictly in {response_lang}. " - "IMPORTANT: Start directly with the description content. Do not add introductory " - "sentences like 'Here is the analysis' or 'The image shows'." - ), - }, - { - "key": "vision_fullscreen", - # Translators: Section header for image analysis prompts in Prompt Manager. - "section": _("Vision"), - # Translators: Label for the prompt used to analyze the entire screen. - "label": _("Full Screen Analysis"), - "prompt": ( - "Analyze this image. Describe the layout, visible text, and UI elements. " - "Use Markdown formatting (headings, lists) to organize the description. " - "Language: {response_lang}. Ensure the response is strictly in {response_lang}. " - "IMPORTANT: Start directly with the description content. Do not add introductory " - "sentences like 'Here is the analysis' or 'The image shows'." - ), - }, - { - "key": "vision_followup_context", - # Translators: Section header for advanced/internal prompts in Prompt Manager. - "section": _("Advanced"), - # Translators: Label for the follow-up context in image analysis chat. 
- "label": _("Vision Follow-up Context"), - "internal": True, - "prompt": "Image Context. Target Language: {response_lang}", - }, - { - "key": "vision_followup_suffix", - # Translators: Section header for advanced/internal prompts in Prompt Manager. - "section": _("Advanced"), - # Translators: Label for the rule enforced during image analysis follow-up questions. - "label": _("Vision Follow-up Answer Rule"), - "internal": True, - "prompt": "Answer strictly in {response_lang}", - }, - { - "key": "video_analysis", - # Translators: Section header for video analysis prompts in Prompt Manager. - "section": _("Video"), - # Translators: Label for the video content analysis prompt. - "label": _("Video Analysis"), - "prompt": ( - "Analyze this video. Provide a detailed description of the visual content and a " - "summary of the audio. IMPORTANT: Write the entire response STRICTLY in " - "{response_lang} language." - ), - }, - { - "key": "audio_transcription", - # Translators: Section header for audio-related prompts in Prompt Manager. - "section": _("Audio"), - # Translators: Label for the audio file transcription prompt. - "label": _("Audio Transcription"), - "prompt": "Transcribe this audio in {response_lang}.", - }, - { - "key": "dictation_transcribe", - # Translators: Section header for audio-related prompts in Prompt Manager. - "section": _("Audio"), - # Translators: Label for the smart voice dictation prompt. - "label": _("Smart Dictation"), - "prompt": ( - "Transcribe speech. Use native script. Fix stutters. If there is no speech, silence, " - "or background noise only, write exactly: [[[NOSPEECH]]]" - ), - }, - { - "key": "ocr_image_extract", - # Translators: Section header for OCR-related prompts in Prompt Manager. - "section": _("OCR"), - # Translators: Label for the OCR prompt used for image text extraction. - "label": _("OCR Image Extraction"), - "prompt": ( - "Extract all visible text from this image. 
Strictly preserve original formatting " - "(headings, lists, tables) using Markdown. Do not output any system messages or " - "code block backticks (```). Output ONLY the raw content." - ), - }, - { - "key": "ocr_document_extract", - # Translators: Section header for OCR-related prompts in Prompt Manager. - "section": _("OCR"), - # Translators: Label for the OCR prompt used for document text extraction. - "label": _("OCR Document Extraction"), - "prompt": ( - "Extract all visible text from this document. Strictly preserve original formatting " - "(headings, lists, tables) using Markdown. You MUST insert the exact delimiter " - "'[[[PAGE_SEP]]]' immediately after the content of every single page. Do not output " - "any system messages or code block backticks (```). Output ONLY the raw content." - ), - }, - { - "key": "ocr_document_translate", - # Translators: Section header for document-related prompts in Prompt Manager. - "section": _("Document"), - # Translators: Label for the combined OCR and translation prompt for documents. - "label": _("Document OCR + Translate"), - "prompt": ( - "Extract all text from this document. Preserve formatting (Markdown). Then translate " - "the content to {target_lang}. Output ONLY the translated content. Do not add " - "explanations." - ), - }, - { - "key": "captcha_solver_base", - # Translators: Section header for CAPTCHA-related prompts in Prompt Manager. - "section": _("CAPTCHA"), - # Translators: Label for the CAPTCHA solving prompt. - "label": _("CAPTCHA Solver"), - "internal": True, - "prompt": ( - "Blind user. Return CAPTCHA code only. If NO CAPTCHA is detected in the image, " - "strictly return: [[[NO_CAPTCHA]]].{captcha_extra}" - ), - }, - { - "key": "refine_files_only", - # Translators: Section header for advanced/internal prompts in Prompt Manager. - "section": _("Advanced"), - # Translators: Label for the fallback prompt when only files are provided in Refine. 
- "label": _("Refine Files-Only Fallback"), - "internal": True, - "prompt": "Analyze these files.", - }, +from .constants import ( + ADDON_NAME, + CHROME_OCR_KEYS, + GEMINI_VOICES, + GITHUB_REPO, + MODELS, + OCR_ENGINES, + PROMPT_VARIABLES_GUIDE, + REFINE_PROMPT_KEYS, + SOURCE_NAMES, + TARGET_CODES, + TARGET_NAMES, ) - -PROMPT_VARIABLES_GUIDE = ( - # Translators: Description and input type for the [selection] variable in the Variables Guide. - ("[selection]", _("Currently selected text"), _("Text")), - # Translators: Description for the [clipboard] variable in the Variables Guide. - ("[clipboard]", _("Clipboard content"), _("Text")), - # Translators: Description and input type for the [screen_obj] variable in the Variables Guide. - ("[screen_obj]", _("Screenshot of the navigator object"), _("Image")), - # Translators: Description for the [screen_full] variable in the Variables Guide. - ("[screen_full]", _("Screenshot of the entire screen"), _("Image")), - # Translators: Description and input type for the [file_ocr] variable in the Variables Guide. - ("[file_ocr]", _("Select image/PDF/TIFF for text extraction"), _("Image, PDF, TIFF")), - # Translators: Description and input type for the [file_read] variable in the Variables Guide. - ("[file_read]", _("Select document for reading"), _("TXT, Code, PDF")), - # Translators: Description and input type for the [file_audio] variable in the Variables Guide. 
- ("[file_audio]", _("Select audio file for analysis"), _("MP3, WAV, OGG")), +from .markdown_utils import clean_markdown, markdown_to_html +from .prompt_helpers import ( + apply_prompt_template, + get_builtin_default_prompts, + get_builtin_default_prompt_map, + get_configured_default_prompt_map, + get_configured_default_prompts, + get_prompt_text, + get_refine_menu_options, + load_configured_custom_prompts, + migrate_prompt_config_if_needed, + serialize_default_prompt_overrides, + serialize_custom_prompts_v2, ) # --- Helpers --- -def get_builtin_default_prompts(): - builtins = [] - for item in DEFAULT_SYSTEM_PROMPTS: - p = str(item["prompt"]).strip() - builtins.append({ - "key": item["key"], - "section": item["section"], - "label": item["label"], - "display_label": f"{item['section']} - {item['label']}", - "internal": bool(item.get("internal")), - "prompt": p, - "default": p, - }) - return builtins - -def get_builtin_default_prompt_map(): - return {item["key"]: item for item in get_builtin_default_prompts()} - -def _normalize_custom_prompt_items(items): - normalized = [] - if not isinstance(items, list): - return normalized - - for item in items: - if not isinstance(item, dict): - continue - name = item.get("name") - content = item.get("content") - if not isinstance(name, str) or not isinstance(content, str): - continue - name = name.strip() - content = content.strip() - if name and content: - normalized.append({"name": name, "content": content}) - return normalized - -def parse_custom_prompts_legacy(raw_value): - items = [] - if not raw_value: - return items - - normalized = raw_value.replace("\r\n", "\n").replace("\r", "\n") - for line in normalized.split("\n"): - for segment in line.split("|"): - segment = segment.strip() - if not segment or ":" not in segment: - continue - name, content = segment.split(":", 1) - name = name.strip() - content = content.strip() - if name and content: - items.append({"name": name, "content": content}) - return items - -def 
parse_custom_prompts_v2(raw_value): - if not isinstance(raw_value, str) or not raw_value.strip(): - return None - try: - data = json.loads(raw_value) - except Exception as e: - log.warning(f"Invalid custom_prompts_v2 config, falling back to legacy format: {e}") - return None - return _normalize_custom_prompt_items(data) - -def serialize_custom_prompts_v2(items): - normalized = _normalize_custom_prompt_items(items) - if not normalized: - return "" - return json.dumps(normalized, ensure_ascii=False) - -def load_configured_custom_prompts(): - try: - raw_v2 = config.conf["VisionAssistant"]["custom_prompts_v2"] - except Exception: - raw_v2 = "" - items_v2 = parse_custom_prompts_v2(raw_v2) - if items_v2 is not None: - return items_v2 - return parse_custom_prompts_legacy(config.conf["VisionAssistant"]["custom_prompts"]) - -def _sanitize_default_prompt_overrides(data): - if not isinstance(data, dict): - return {}, False - - changed = False - mutable = dict(data) - # Migrate old key used in previous versions. 
- legacy_vision = mutable.pop("vision_image_analysis", None) - if legacy_vision is not None: - changed = True - if isinstance(legacy_vision, str) and legacy_vision.strip(): - legacy_text = legacy_vision.strip() - nav_value = mutable.get("vision_navigator_object") - if not isinstance(nav_value, str) or not nav_value.strip(): - mutable["vision_navigator_object"] = legacy_text - changed = True - full_value = mutable.get("vision_fullscreen") - if not isinstance(full_value, str) or not full_value.strip(): - mutable["vision_fullscreen"] = legacy_text - changed = True - - valid_keys = set(get_builtin_default_prompt_map().keys()) - sanitized = {} - for key, value in mutable.items(): - if key not in valid_keys or not isinstance(value, str): - changed = True - continue - prompt_text = value.strip() - if not prompt_text: - changed = True - continue - if key in LEGACY_REFINER_TOKENS and prompt_text == LEGACY_REFINER_TOKENS[key]: - # Drop old token-only overrides and fallback to current built-ins. - changed = True - continue - if prompt_text != value: - changed = True - sanitized[key] = prompt_text - return sanitized, changed - -def migrate_prompt_config_if_needed(): - changed = False - - try: - raw_v2 = config.conf["VisionAssistant"]["custom_prompts_v2"] - except Exception: - raw_v2 = "" - raw_legacy = config.conf["VisionAssistant"]["custom_prompts"] - - v2_items = parse_custom_prompts_v2(raw_v2) - if v2_items is None: - target_items = parse_custom_prompts_legacy(raw_legacy) - else: - target_items = v2_items - - serialized_v2 = serialize_custom_prompts_v2(target_items) - if serialized_v2 != (raw_v2 or ""): - config.conf["VisionAssistant"]["custom_prompts_v2"] = serialized_v2 - changed = True - - # Legacy mirror is disabled. Clear old storage to prevent stale fallback data. 
- if raw_legacy: - config.conf["VisionAssistant"]["custom_prompts"] = "" - changed = True - - try: - raw_defaults = config.conf["VisionAssistant"]["default_refine_prompts"] - except Exception: - raw_defaults = "" - if isinstance(raw_defaults, str) and raw_defaults.strip(): - try: - defaults_data = json.loads(raw_defaults) - except Exception: - defaults_data = None - if isinstance(defaults_data, dict): - sanitized, migrated = _sanitize_default_prompt_overrides(defaults_data) - if migrated: - config.conf["VisionAssistant"]["default_refine_prompts"] = ( - json.dumps(sanitized, ensure_ascii=False) if sanitized else "" - ) - changed = True - - return changed - -def load_default_prompt_overrides(): - try: - raw = config.conf["VisionAssistant"]["default_refine_prompts"] - except Exception: - raw = "" - if not isinstance(raw, str) or not raw.strip(): - return {} - - try: - data = json.loads(raw) - except Exception as e: - log.warning(f"Invalid default_refine_prompts config, using built-ins: {e}") - return {} - - overrides, _ = _sanitize_default_prompt_overrides(data) - return overrides - -def get_configured_default_prompt_map(): - prompt_map = get_builtin_default_prompt_map() - overrides = load_default_prompt_overrides() - for key, override in overrides.items(): - if key not in prompt_map: - continue - if key in LEGACY_REFINER_TOKENS and override == LEGACY_REFINER_TOKENS[key]: - continue - prompt_map[key]["prompt"] = override - return prompt_map - -def get_configured_default_prompts(): - prompt_map = get_configured_default_prompt_map() - items = [] - for item in DEFAULT_SYSTEM_PROMPTS: - if item.get("internal"): - continue - key = item["key"] - if key in prompt_map: - items.append(dict(prompt_map[key])) - items.sort(key=lambda item: item.get("display_label", "").casefold()) - return items - -def get_prompt_text(prompt_key): - prompt_map = get_configured_default_prompt_map() - item = prompt_map.get(prompt_key) - if item: - return item["prompt"] - return "" - -def 
serialize_default_prompt_overrides(items): - if not items: - return "" - - base_map = {item["key"]: item["prompt"] for item in get_builtin_default_prompts()} - overrides = {} - for item in items: - key = item.get("key") - prompt_text = item.get("prompt", "") - if key not in base_map: - continue - if not isinstance(prompt_text, str): - continue - prompt_text = prompt_text.strip() - if prompt_text and prompt_text != base_map[key]: - overrides[key] = prompt_text - - if not overrides: - return "" - return json.dumps(overrides, ensure_ascii=False) - -def get_refine_menu_options(): - options = [] - prompt_map = get_configured_default_prompt_map() - for key in REFINE_PROMPT_KEYS: - item = prompt_map.get(key) - if item: - options.append((item["label"], item["prompt"])) - - for item in load_configured_custom_prompts(): - # Translators: Prefix for custom prompts in the Refine menu - options.append((_("Custom: ") + item["name"], item["content"])) - return options - -def apply_prompt_template(template, replacements): - if not isinstance(template, str): - return "" - - text = template - for key, value in replacements: - text = text.replace("{" + key + "}", str(value)) - - return text.strip() - def finally_(func, final): @wraps(func) def new(*args, **kwargs): @@ -706,1661 +88,34 @@ def new(*args, **kwargs): final() return new -def clean_markdown(text): - if not text: return "" - text = re.sub(r'\*\*|__|[*_]', '', text) - text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE) - text = re.sub(r'```', '', text) - text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) - text = re.sub(r'^\s*-\s+', '', text, flags=re.MULTILINE) - return text.strip() - -def markdown_to_html(text, full_page=False): - if not text: return "" - - html = text.replace("&", "&").replace("<", "<").replace(">", ">") - html = re.sub(r'\*\*(.*?)\*\*', r'\1', html) - html = re.sub(r'__(.*?)__', r'\1', html) - html = re.sub(r'^### (.*)', r'

\1

', html, flags=re.M) - html = re.sub(r'^## (.*)', r'

\1

', html, flags=re.M) - html = re.sub(r'^# (.*)', r'

\1

', html, flags=re.M) - - lines = html.split('\n') - in_table = False - new_lines = [] - table_style = 'border="1" style="border-collapse: collapse; width: 100%; margin-bottom: 10px;"' - td_style = 'style="padding: 5px; border: 1px solid #ccc;"' - - for line in lines: - stripped = line.strip() - if stripped.startswith('|') or (stripped.count('|') > 1 and len(stripped) > 5): - if not in_table: - new_lines.append(f'') - in_table = True - if '---' in stripped: continue - row_content = stripped.strip('|').split('|') - cells = "".join([f'' for c in row_content]) - new_lines.append(f'{cells}') - else: - if in_table: - new_lines.append('
{c.strip()}
') - in_table = False - if stripped: new_lines.append(line + "
") - else: new_lines.append("
") - if in_table: new_lines.append('') - html_body = "".join(new_lines) - - if not full_page: return html_body - return f"""{html_body}""" - -def get_mime_type(path): - ext = os.path.splitext(path)[1].lower() - if ext == '.pdf': return 'application/pdf' - if ext in ['.jpg', '.jpeg']: return 'image/jpeg' - if ext == '.png': return 'image/png' - if ext == '.webp': return 'image/webp' - if ext in ['.tif', '.tiff']: return 'image/jpeg' - if ext == '.mp3': return 'audio/mpeg' - if ext == '.wav': return 'audio/wav' - if ext == '.ogg': return 'audio/ogg' - if ext == '.mp4': return 'video/mp4' - return 'application/octet-stream' - -def show_error_dialog(message): - # Translators: Title of the error dialog box - title = _("{name} Error").format(name=ADDON_NAME) - wx.CallAfter(gui.messageBox, message, title, wx.OK | wx.ICON_ERROR) - -def send_ctrl_v(): - try: - user32 = ctypes.windll.user32 - VK_CONTROL = 0x11; VK_V = 0x56; KEYEVENTF_KEYUP = 0x0002 - user32.keybd_event(VK_CONTROL, 0, 0, 0) - user32.keybd_event(VK_V, 0, 0, 0) - user32.keybd_event(VK_V, 0, KEYEVENTF_KEYUP, 0) - user32.keybd_event(VK_CONTROL, 0, KEYEVENTF_KEYUP, 0) - except: pass - -def get_proxy_opener(): - proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip() - if proxy_url: - if "127.0.0.1" in proxy_url or "localhost" in proxy_url or ":" in proxy_url.split("/")[-1]: - handler = request.ProxyHandler({'http': proxy_url, 'https': proxy_url}) - return request.build_opener(handler) - return request.build_opener() - -def get_twitter_download_link(tweet_url): - cj = cookiejar.CookieJar() - opener = request.build_opener(request.HTTPCookieProcessor(cj)) - base_url = "https://savetwitter.net/en4" - api_url = "https://savetwitter.net/api/ajaxSearch" - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'X-Requested-With': 'XMLHttpRequest', 'Referer': base_url} - try: - req_init = request.Request(base_url, headers=headers) - opener.open(req_init) - params = {'q': tweet_url, 
'lang': 'en', 'cftoken': ''} - data = urlencode(params).encode('utf-8') - req_post = request.Request(api_url, data=data, headers=headers, method='POST') - with opener.open(req_post) as response: - res_data = json.loads(response.read().decode('utf-8')) - if res_data.get('status') == 'ok': - html = res_data.get('data', '') - match = re.search(r'href="(https?://dl\.snapcdn\.app/[^"]+)"', html) - if match: return match.group(1) - except: pass - return None - -def get_instagram_download_link(insta_url): - cj = cookiejar.CookieJar() - opener = request.build_opener(request.HTTPCookieProcessor(cj)) - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36', - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': 'https://anon-viewer.com/', - 'Accept': '*/*' - } - opener.addheaders = list(headers.items()) - try: - opener.open("https://anon-viewer.com/", timeout=30) - - if "/stories/" in insta_url: - parts = insta_url.split("/") - username = parts[parts.index("stories") + 1] - api_url = f"https://anon-viewer.com/content.php?url={username}&method=allstories" - else: - encoded_url = quote(insta_url, safe='') - api_url = f"https://anon-viewer.com/content.php?url={encoded_url}" - - response = opener.open(api_url, timeout=60) - if response.getcode() == 200: - res_content = response.read().decode('utf-8') - data = json.loads(res_content) - html_text = data.get('html', '') - - match = re.search(r'href="([^"]+anon-viewer\.com/media\.php\?media=[^"]+)"', html_text) - if match: - return match.group(1).replace('&', '&') - - source_match = re.search(r' 0: - result_parts = [x[0] for x in data[0] if x[0]] - return "".join(result_parts) - except Exception as e: - log.error(f"Google Translate Failed: {e}", exc_info=True) - return text - return text - -class GeminiHandler: - _working_key_idx = 0 - _file_uri_keys = {} - _max_retries = 5 - - @staticmethod - def _get_api_keys(): - raw = 
config.conf["VisionAssistant"]["api_key"] - clean_raw = raw.replace('\r\n', ',').replace('\n', ',') - return [k.strip() for k in clean_raw.split(',') if k.strip()] - - @staticmethod - def _get_opener(): - return get_proxy_opener() - - @staticmethod - def _handle_error(e): - if hasattr(e, 'code'): - # Translators: Error message for Bad Request (400) - if e.code == 400: return _("Error 400: Bad Request (Check API Key)") - # Translators: Error message for Forbidden (403) - if e.code == 403: return _("Error 403: Forbidden (Check Region)") - if e.code == 429: return "QUOTA_EXCEEDED" - if e.code >= 500: return "SERVER_ERROR" - return str(e) - - @staticmethod - def _call_with_retry(func_logic, key, *args): - last_exc = None - for attempt in range(GeminiHandler._max_retries): - try: - return func_logic(key, *args) - except error.HTTPError as e: - err_msg = GeminiHandler._handle_error(e) - if err_msg not in ["QUOTA_EXCEEDED", "SERVER_ERROR"]: - raise - last_exc = e - except error.URLError as e: - last_exc = e - if attempt < GeminiHandler._max_retries - 1: - time.sleep(0.5 * (attempt + 1)) - raise last_exc - - @staticmethod - def _register_file_uri(uri, key): - if uri and key: - GeminiHandler._file_uri_keys[uri] = key - while len(GeminiHandler._file_uri_keys) > 200: - GeminiHandler._file_uri_keys.pop(next(iter(GeminiHandler._file_uri_keys))) - - @staticmethod - def _get_registered_key(uri): - if not uri: - return None - return GeminiHandler._file_uri_keys.get(uri) - - @staticmethod - def _call_with_key(func_logic, key, *args): - try: - return GeminiHandler._call_with_retry(func_logic, key, *args) - except error.HTTPError as e: - err_msg = GeminiHandler._handle_error(e) - if err_msg == "QUOTA_EXCEEDED": - # Translators: Message of a dialog which may pop up while performing an AI call - err_msg = _("Error 429: Quota Exceeded (Try later)") - elif err_msg == "SERVER_ERROR": - # Translators: Message of a dialog which may pop up while performing an AI call - err_msg = _("Server 
Error {code}: {reason}").format(code=e.code, reason=e.reason) - return "ERROR:" + err_msg - except Exception as e: - return "ERROR:" + str(e) - - @staticmethod - def _call_with_rotation(func_logic, *args): - keys = GeminiHandler._get_api_keys() - if not keys: - # Translators: Error when no API keys are found in settings - return "ERROR:" + _("No API Keys configured.") - - num_keys = len(keys) - for i in range(num_keys): - idx = (GeminiHandler._working_key_idx + i) % num_keys - key = keys[idx] - try: - res = GeminiHandler._call_with_retry(func_logic, key, *args) - GeminiHandler._working_key_idx = idx - return res - except error.HTTPError as e: - err_msg = GeminiHandler._handle_error(e) - if err_msg in ["QUOTA_EXCEEDED", "SERVER_ERROR"]: - if i < num_keys - 1: continue - # Translators: Error when all available API keys fail - return "ERROR:" + _("All API Keys failed (Quota/Server).") - return "ERROR:" + err_msg - except Exception as e: - return "ERROR:" + str(e) - return "ERROR:" + _("Unknown error occurred.") - - @staticmethod - def translate(text, target_lang): - def _logic(key, txt, lang): - model = config.conf["VisionAssistant"]["model_name"] - url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent" - quick_template = get_prompt_text("translate_quick") or "Translate to {target_lang}. Output ONLY translation." 
- quick_prompt = apply_prompt_template(quick_template, [("target_lang", lang)]) - payload = {"contents": [{"parts": [{"text": quick_prompt}, {"text": txt}]}]} - req = request.Request(url, data=json.dumps(payload).encode('utf-8'), headers={"Content-Type": "application/json", "x-goog-api-key": key}) - with GeminiHandler._get_opener().open(req, timeout=90) as r: - return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text'] - return GeminiHandler._call_with_rotation(_logic, text, target_lang) - - @staticmethod - def ocr_page(image_bytes): - def _logic(key, img_data): - model = config.conf["VisionAssistant"]["model_name"] - url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent" - ocr_image_prompt = get_prompt_text("ocr_image_extract") - payload = {"contents": [{"parts": [{"inline_data": {"mime_type": "image/jpeg", "data": base64.b64encode(img_data).decode('utf-8')}}, {"text": ocr_image_prompt}]}]} - req = request.Request(url, data=json.dumps(payload).encode('utf-8'), headers={"Content-Type": "application/json", "x-goog-api-key": key}) - with GeminiHandler._get_opener().open(req, timeout=120) as r: - return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text'] - return GeminiHandler._call_with_rotation(_logic, image_bytes) - - @staticmethod - def upload_and_process_batch(file_path, mime_type, page_count): - keys = GeminiHandler._get_api_keys() - if not keys: - # Translators: Error message for missing API Keys - return [ "ERROR:" + _("No API Keys.") ] - model = config.conf["VisionAssistant"]["model_name"] - - opener = GeminiHandler._get_opener() - proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip() - base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com" - - for i, key in enumerate(keys): - try: - f_size = os.path.getsize(file_path) - init_url = f"{base_url}/upload/v1beta/files" - headers = {"X-Goog-Upload-Protocol": "resumable", 
"X-Goog-Upload-Command": "start", "X-Goog-Upload-Header-Content-Length": str(f_size), "X-Goog-Upload-Header-Content-Type": mime_type, "Content-Type": "application/json", "x-goog-api-key": key} - - req = request.Request(init_url, data=json.dumps({"file": {"display_name": "batch"}}).encode(), headers=headers, method="POST") - with opener.open(req, timeout=120) as r: upload_url = r.headers.get("x-goog-upload-url") - - with open(file_path, 'rb') as f: f_data = f.read() - req_up = request.Request(upload_url, data=f_data, headers={"Content-Length": str(f_size), "X-Goog-Upload-Offset": "0", "X-Goog-Upload-Command": "upload, finalize"}, method="POST") - with opener.open(req_up, timeout=180) as r: - res = json.loads(r.read().decode()) - uri, name = res['file']['uri'], res['file']['name'] - - active = False - for attempt in range(30): - req_check = request.Request(f"{base_url}/v1beta/{name}", headers={"x-goog-api-key": key}) - with opener.open(req_check, timeout=30) as r: - state = json.loads(r.read().decode()).get('state') - if state == "ACTIVE": - active = True - break - if state == "FAILED": - break - time.sleep(2) - - if not active: - if i < len(keys) - 1: - continue - return [ "ERROR:" + _("Upload failed.") ] - - GeminiHandler._register_file_uri(uri, key) - - url = f"{base_url}/v1beta/models/{model}:generateContent" - prompt = get_prompt_text("ocr_document_extract") - contents = [{"parts": [{"file_data": {"mime_type": mime_type, "file_uri": uri}}, {"text": prompt}]}] - - req_gen = request.Request(url, data=json.dumps({"contents": contents}).encode(), headers={"Content-Type": "application/json", "x-goog-api-key": key}) - with opener.open(req_gen, timeout=180) as r: - res = json.loads(r.read().decode()) - text = res['candidates'][0]['content']['parts'][0]['text'] - return text.split('[[[PAGE_SEP]]]') - - except error.HTTPError as e: - err_code = GeminiHandler._handle_error(e) - if err_code in ["QUOTA_EXCEEDED", "SERVER_ERROR"] and i < len(keys) - 1: - continue - if 
err_code == "QUOTA_EXCEEDED": - # Translators: Message of a dialog which may pop up while performing an AI call - err_msg = _("Error 429: Quota Exceeded (Try later)") - elif err_code == "SERVER_ERROR": - # Translators: Message of a dialog which may pop up while performing an AI call - err_msg = _("Server Error {code}: {reason}").format(code=e.code, reason=e.reason) - else: - err_msg = err_code - return ["ERROR:" + err_msg] - except Exception as e: - return ["ERROR:" + str(e)] - return ["ERROR:" + _("All keys failed.")] - - @staticmethod - def chat(history, new_msg, file_uri, mime_type): - def _logic(key, hist, msg, uri, mime): - model = config.conf["VisionAssistant"]["model_name"] - proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip() - base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com" - url = f"{base_url}/v1beta/models/{model}:generateContent" - - contents = list(hist) - if uri: - user_parts = [{"file_data": {"mime_type": mime, "file_uri": uri}}] - else: - user_parts = [] - user_parts.append({"text": msg}) - contents.append({"role": "user", "parts": user_parts}) - - req = request.Request(url, data=json.dumps({"contents": contents}).encode(), headers={"Content-Type": "application/json", "x-goog-api-key": key}) - with GeminiHandler._get_opener().open(req, timeout=120) as r: - return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text'] - forced_key = GeminiHandler._get_registered_key(file_uri) if file_uri else None - if forced_key: - return GeminiHandler._call_with_key(_logic, forced_key, history, new_msg, file_uri, mime_type) - return GeminiHandler._call_with_rotation(_logic, history, new_msg, file_uri, mime_type) - - @staticmethod - def upload_for_chat(file_path, mime_type): - keys = GeminiHandler._get_api_keys() - if not keys: return None - opener = GeminiHandler._get_opener() - proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip() - base_url = proxy_url.rstrip('/') if 
proxy_url else "https://generativelanguage.googleapis.com" - - for key in keys: - try: - f_size = os.path.getsize(file_path) - init_url = f"{base_url}/upload/v1beta/files" - headers = {"X-Goog-Upload-Protocol": "resumable", "X-Goog-Upload-Command": "start", "X-Goog-Upload-Header-Content-Length": str(f_size), "X-Goog-Upload-Header-Content-Type": mime_type, "Content-Type": "application/json", "x-goog-api-key": key} - req = request.Request(init_url, data=json.dumps({"file": {"display_name": os.path.basename(file_path)}}).encode(), headers=headers, method="POST") - with opener.open(req, timeout=120) as r: upload_url = r.headers.get("x-goog-upload-url") - with open(file_path, 'rb') as f: f_data = f.read() - req_up = request.Request(upload_url, data=f_data, headers={"Content-Length": str(f_size), "X-Goog-Upload-Offset": "0", "X-Goog-Upload-Command": "upload, finalize"}, method="POST") - with opener.open(req_up, timeout=180) as r: - res = json.loads(r.read().decode()) - uri, name = res['file']['uri'], res['file']['name'] - for attempt in range(30): - req_check = request.Request(f"{base_url}/v1beta/{name}", headers={"x-goog-api-key": key}) - with opener.open(req_check, timeout=30) as r: - state = json.loads(r.read().decode()).get('state') - if state == "ACTIVE": - GeminiHandler._register_file_uri(uri, key) - return uri - time.sleep(2) - return None - except: continue - return None - - @staticmethod - def generate_speech(text, voice_name): - def _logic(key, txt, voice): - main_model = config.conf["VisionAssistant"]["model_name"] - if "pro" in main_model.lower(): - tts_model = "gemini-2.5-pro-preview-tts" - else: - tts_model = "gemini-2.5-flash-preview-tts" - - proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip() - base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com" - url = f"{base_url}/v1beta/models/{tts_model}:generateContent" - - payload = { - "contents": [{"parts": [{"text": txt}]}], - "generationConfig": { - 
"responseModalities": ["AUDIO"], - "speechConfig": {"voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}}} - } - } - req = request.Request(url, data=json.dumps(payload).encode('utf-8'), headers={"Content-Type": "application/json", "x-goog-api-key": key}) - with GeminiHandler._get_opener().open(req, timeout=600) as r: - res = json.loads(r.read().decode()) - candidates = res.get('candidates', []) - if not candidates: raise Exception("No candidates returned") - content = candidates[0].get('content', {}) - parts = content.get('parts', []) - if not parts: raise Exception("No parts in response") - part = parts[0] - if 'inlineData' in part: return part['inlineData']['data'] - if 'inline_data' in part: return part['inline_data']['data'] - if 'text' in part: raise Exception(f"Model refused audio: {part['text']}") - raise Exception("Unknown response format") - return GeminiHandler._call_with_rotation(_logic, text, voice_name) +from .services import ( + ChromeOCREngine, + GeminiHandler, + GoogleTranslator, + SmartProgrammersOCREngine, + VirtualDocument, + _download_temp_video, + get_file_path, + get_instagram_download_link, + get_mime_type, + get_proxy_opener, + get_tiktok_download_link, + get_twitter_download_link, + send_ctrl_v, + show_error_dialog, +) # --- Update Manager --- -class UpdateDialog(wx.Dialog): - def __init__(self, parent, version, name, changes): - # Translators: Title of update confirmation dialog - super().__init__(parent, title=_("Update Available"), size=(500, 450)) - self.Centre() - - panel = wx.Panel(self) - vbox = wx.BoxSizer(wx.VERTICAL) - - # Translators: Message asking user to update. {version} is version number. 
- msg = _("A new version ({version}) of {name} is available.").format(version=version, name=name) - header = wx.StaticText(panel, label=msg) - vbox.Add(header, 0, wx.ALL, 15) - - # Translators: Label for the changes text box - change_lbl = wx.StaticText(panel, label=_("Changes:")) - vbox.Add(change_lbl, 0, wx.LEFT | wx.RIGHT, 15) - - self.changes_ctrl = wx.TextCtrl(panel, value=changes, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2) - vbox.Add(self.changes_ctrl, 1, wx.EXPAND | wx.ALL, 15) - - # Translators: Question to download and install - question = wx.StaticText(panel, label=_("Download and Install?")) - vbox.Add(question, 0, wx.LEFT | wx.RIGHT | wx.BOTTOM, 15) - - btn_sizer = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Button to accept update - self.yes_btn = wx.Button(panel, wx.ID_YES, label=_("&Yes")) - # Translators: Button to reject update - self.no_btn = wx.Button(panel, wx.ID_NO, label=_("&No")) - - btn_sizer.Add(self.yes_btn, 0, wx.RIGHT, 10) - btn_sizer.Add(self.no_btn, 0) - vbox.Add(btn_sizer, 0, wx.ALIGN_RIGHT | wx.ALL, 15) - - panel.SetSizer(vbox) - self.yes_btn.SetDefault() - self.yes_btn.Bind(wx.EVT_BUTTON, lambda e: self.EndModal(wx.ID_YES)) - self.no_btn.Bind(wx.EVT_BUTTON, lambda e: self.EndModal(wx.ID_NO)) - -class UpdateManager: - def __init__(self, repo_name): - self.repo_name = repo_name - self.current_version = addonHandler.getCodeAddon().manifest['version'] - - def check_for_updates(self, silent=True): - threading.Thread(target=self._check_thread, args=(silent,), daemon=True).start() - - def _check_thread(self, silent): - try: - url = f"https://api.github.com/repos/{self.repo_name}/releases/latest" - req = request.Request(url, headers={"User-Agent": "NVDA-Addon"}) - with request.urlopen(req, timeout=60) as response: - if response.status == 200: - data = json.loads(response.read().decode('utf-8')) - latest_tag = data.get("tag_name", "").lstrip("v") - if self._compare_versions(latest_tag, self.current_version) > 0: - download_url = 
None - for asset in data.get("assets", []): - if asset["name"].endswith(".nvda-addon"): - download_url = asset["browser_download_url"] - break - if download_url: - raw_changes = data.get("body", "") - - clean_changes = re.split(r'SHA256|Checklist|---', raw_changes, flags=re.I)[0].strip() - clean_changes = clean_markdown(clean_changes) - - wx.CallAfter(self._prompt_update, latest_tag, download_url, clean_changes) - elif not silent: - # Translators: Error message when an update is found but the addon file is missing from GitHub. - msg = _("Update found but no .nvda-addon file in release.") - show_error_dialog(msg) - elif not silent: - # Translators: Status message informing the user they are already on the latest version. - msg = _("You have the latest version.") - wx.CallAfter(ui.message, msg) - except Exception as e: - if not silent: - msg = _("Update check failed: {error}").format(error=e) - show_error_dialog(msg) - - def _compare_versions(self, v1, v2): - try: - parts1 = [int(x) for x in v1.split('.')] - parts2 = [int(x) for x in v2.split('.')] - return (parts1 > parts2) - (parts1 < parts2) - except: return 0 if v1 == v2 else 1 - - def _prompt_update(self, version, url, changes): - dlg = UpdateDialog(gui.mainFrame, version, ADDON_NAME, changes) - if dlg.ShowModal() == wx.ID_YES: - threading.Thread(target=self._download_install_worker, args=(url,), daemon=True).start() - dlg.Destroy() - - def _download_install_worker(self, url): - try: - # Translators: Message shown while downloading update - msg = _("Downloading update...") - wx.CallAfter(ui.message, msg) - temp_dir = tempfile.gettempdir() - file_path = os.path.join(temp_dir, "VisionAssistant_Update.nvda-addon") - with request.urlopen(url) as response, open(file_path, 'wb') as out_file: - out_file.write(response.read()) - wx.CallAfter(os.startfile, file_path) - except Exception as e: - # Translators: Error message for download failure - msg = _("Download failed: {error}").format(error=e) - show_error_dialog(msg) 
- -# --- UI Classes --- - - -class VisionQADialog(wx.Dialog): - def __init__(self, parent, title, initial_text, context_data, callback_fn, extra_info=None, raw_content=None, status_callback=None, announce_on_open=True, allow_questions=True): - super(VisionQADialog, self).__init__(parent, title=title, size=(550, 500), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER) - self.context_data = context_data - self.callback_fn = callback_fn - self.extra_info = extra_info - self.chat_history = [] - self.raw_content = raw_content - self.status_callback = status_callback - self.announce_on_open = announce_on_open - self.allow_questions = allow_questions - - mainSizer = wx.BoxSizer(wx.VERTICAL) - # Translators: Label for the AI response text area in a chat dialog - lbl_text = _("AI Response:") - lbl = wx.StaticText(self, label=lbl_text) - mainSizer.Add(lbl, 0, wx.ALL, 5) - self.outputArea = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY) - mainSizer.Add(self.outputArea, 1, wx.EXPAND | wx.ALL, 5) - - self.should_clean = config.conf["VisionAssistant"]["clean_markdown_chat"] - display_text = clean_markdown(initial_text) if self.should_clean else initial_text - if display_text: - # Translators: Format for displaying AI message in a chat dialog - init_msg = _("AI: {text}\n").format(text=display_text) - self.outputArea.AppendText(init_msg) - if config.conf["VisionAssistant"]["copy_to_clipboard"]: - api.copyToClip(raw_content if raw_content else display_text) - - if not (extra_info and extra_info.get('skip_init_history')): - self.chat_history.append({"role": "model", "parts": [{"text": initial_text}]}) - - self.inputArea = None - if allow_questions: - # Translators: Label for user input field in a chat dialog - ask_text = _("Ask:") - inputLbl = wx.StaticText(self, label=ask_text) - mainSizer.Add(inputLbl, 0, wx.ALL, 5) - self.inputArea = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, size=(-1, 30)) - mainSizer.Add(self.inputArea, 0, wx.EXPAND | wx.ALL, 5) - - btnSizer = 
wx.BoxSizer(wx.HORIZONTAL) - self.askBtn = None - if allow_questions: - # Translators: Button to send message in a chat dialog - self.askBtn = wx.Button(self, label=_("Send")) - # Translators: Button to view the content in a formatted HTML window - self.viewBtn = wx.Button(self, label=_("View Formatted")) - self.viewBtn.Bind(wx.EVT_BUTTON, self.onView) - # Translators: Button to save only the result content without chat history - self.saveContentBtn = wx.Button(self, label=_("Save Content")) - self.saveContentBtn.Bind(wx.EVT_BUTTON, self.onSaveContent) - # Translators: Button to save chat in a chat dialog - self.saveBtn = wx.Button(self, label=_("Save Chat")) - # Translators: Button to close chat dialog - self.closeBtn = wx.Button(self, wx.ID_CANCEL, label=_("Close")) - - self.saveBtn.Enable(bool(initial_text.strip())) - self.viewBtn.Enable(bool(self.raw_content)) - self.saveContentBtn.Enable(bool(self.raw_content)) - - if self.askBtn: - btnSizer.Add(self.askBtn, 0, wx.ALL, 5) - btnSizer.Add(self.viewBtn, 0, wx.ALL, 5) - btnSizer.Add(self.saveContentBtn, 0, wx.ALL, 5) - btnSizer.Add(self.saveBtn, 0, wx.ALL, 5) - btnSizer.Add(self.closeBtn, 0, wx.ALL, 5) - mainSizer.Add(btnSizer, 0, wx.ALIGN_RIGHT) - - self.SetSizer(mainSizer) - if self.inputArea: - self.inputArea.SetFocus() - else: - self.outputArea.SetFocus() - if self.askBtn: - self.askBtn.Bind(wx.EVT_BUTTON, self.onAsk) - self.saveBtn.Bind(wx.EVT_BUTTON, self.onSave) - if self.inputArea: - self.inputArea.Bind(wx.EVT_TEXT_ENTER, self.onAsk) - if display_text and self.announce_on_open: - wx.CallLater(300, ui.message, display_text) - - def onAsk(self, event): - if not self.inputArea: - return - question = self.inputArea.Value - if not question.strip(): return - # Translators: Format for displaying User message in a chat dialog - user_msg = _("\nYou: {text}\n").format(text=question) - self.outputArea.AppendText(user_msg) - self.inputArea.Clear() - # Translators: Message shown while processing in a chat dialog - msg 
= _("Thinking...") - ui.message(msg) - threading.Thread(target=self.process_question, args=(question,), daemon=True).start() - - def process_question(self, question): - result_tuple = self.callback_fn(self.context_data, question, self.chat_history, self.extra_info) - response_text, _ = result_tuple - if response_text: - if not (self.extra_info and self.extra_info.get('file_context')): - self.chat_history.append({"role": "user", "parts": [{"text": question}]}) - self.chat_history.append({"role": "model", "parts": [{"text": response_text}]}) - final_text = clean_markdown(response_text) if self.should_clean else response_text - wx.CallAfter(self.update_response, final_text, response_text) - - def update_response(self, display_text, raw_text=None): - if raw_text: - self.raw_content = raw_text - self.viewBtn.Enable(True) - self.saveContentBtn.Enable(True) - # Translators: Format for displaying AI message in a chat dialog - ai_msg = _("AI: {text}\n").format(text=display_text) - self.outputArea.AppendText(ai_msg) - self.saveBtn.Enable(True) - if config.conf["VisionAssistant"]["copy_to_clipboard"]: - api.copyToClip(raw_text if raw_text else display_text) - self.outputArea.ShowPosition(self.outputArea.GetLastPosition()) - ui.message(display_text) - - def report_save(self, msg): - if self.status_callback: self.status_callback(msg) - else: ui.message(msg) - - def onView(self, event): - full_html = "" - # Translators: Format for displaying User message in a chat dialog - user_label = _("\nYou: {text}\n").format(text="").strip() - # Translators: Format for displaying AI message in a chat dialog - ai_label = _("AI: {text}\n").format(text="").strip() - - if self.chat_history: - for item in self.chat_history: - role = item.get("role", "") - text = item.get("parts", [{}])[0].get("text", "") - if role == "user": - safe_text = text.replace("&", "&").replace("<", "<").replace(">", ">") - full_html += f"

{user_label}

{safe_text}

" - elif role == "model": - formatted_text = markdown_to_html(text, full_page=False) - full_html += f"

{ai_label}

{formatted_text}
" - - if not full_html and self.raw_content: - formatted_text = markdown_to_html(self.raw_content, full_page=False) - full_html += f"

{ai_label}

{formatted_text}" - - if not full_html: return - try: - # Translators: Title of the formatted result window - ui.browseableMessage(full_html, _("Formatted Conversation"), isHtml=True) - except Exception as e: - # Translators: Error message if viewing fails - msg = _("Error displaying content: {error}").format(error=e) - show_error_dialog(msg) - - def onSave(self, event): - # Translators: Save dialog title - path = get_file_path(_("Save Chat Log"), "Text files (*.txt)|*.txt", mode="save") - if path: - try: - with open(path, "w", encoding="utf-8") as f: f.write(self.outputArea.GetValue()) - # Translators: Message shown on successful save of a file. - self.report_save(_("Saved.")) - except Exception as e: - # Translators: Message in the error dialog when saving fails. - msg = _("Save failed: {error}").format(error=e) - show_error_dialog(msg) - - def onSaveContent(self, event): - # Translators: Save dialog title - path = get_file_path(_("Save Result"), "HTML files (*.html)|*.html", mode="save") - if path: - try: - full_html = markdown_to_html(self.raw_content, full_page=True) - with open(path, "w", encoding="utf-8") as f: f.write(full_html) - # Translators: Message on successful save - self.report_save(_("Saved.")) - except Exception as e: - # Translators: Message in the error dialog when saving fails. 
- msg = _("Save failed: {error}").format(error=e) - show_error_dialog(msg) - -class SettingsPanel(gui.settingsDialogs.SettingsPanel): - title = ADDON_NAME - def makeSettings(self, settingsSizer): - # --- Connection Group --- - # Translators: Title of the settings group for connection and updates - groupLabel = _("Connection") - self.connectionBox = wx.StaticBox(self, label=groupLabel) - connectionSizer = wx.StaticBoxSizer(self.connectionBox, wx.VERTICAL) - cHelper = gui.guiHelper.BoxSizerHelper(self.connectionBox, sizer=connectionSizer) - - # Translators: Label for API Key input - apiLabel = wx.StaticText(self.connectionBox, label=_("Gemini API Key (Separate multiple keys with comma or newline):")) - cHelper.addItem(apiLabel) - - api_value = config.conf["VisionAssistant"]["api_key"] - - self.apiKeyCtrl_hidden = wx.TextCtrl(self.connectionBox, value=api_value, style=wx.TE_PASSWORD, size=(-1, -1)) - - self.apiKeyCtrl_visible = wx.TextCtrl(self.connectionBox, value=api_value, style=wx.TE_MULTILINE | wx.TE_DONTWRAP, size=(-1, 60)) - self.apiKeyCtrl_visible.Hide() - - cHelper.addItem(self.apiKeyCtrl_hidden) - cHelper.addItem(self.apiKeyCtrl_visible) - - # Translators: Checkbox to toggle API Key visibility - self.showApiCheck = wx.CheckBox(self.connectionBox, label=_("Show API Key")) - self.showApiCheck.Bind(wx.EVT_CHECKBOX, self.onToggleApiVisibility) - cHelper.addItem(self.showApiCheck) - - model_display_names = [opt[0] for opt in MODELS] - # Translators: Label for Model selection - self.model = cHelper.addLabeledControl(_("AI Model:"), wx.Choice, choices=model_display_names) - current_id = config.conf["VisionAssistant"]["model_name"] - try: - index = next(i for i, v in enumerate(MODELS) if v[1] == current_id) - self.model.SetSelection(index) - except StopIteration: self.model.SetSelection(0) - - # Translators: Label for Proxy URL input - self.proxyUrl = cHelper.addLabeledControl(_("Proxy URL:"), wx.TextCtrl) - self.proxyUrl.Value = 
config.conf["VisionAssistant"]["proxy_url"] - - # Translators: Checkbox to enable/disable automatic update checks on NVDA startup - self.checkUpdateStartup = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Check for updates on startup"))) - self.checkUpdateStartup.Value = config.conf["VisionAssistant"]["check_update_startup"] - # Translators: Checkbox to toggle markdown cleaning in chat windows - self.cleanMarkdown = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Clean Markdown in Chat"))) - self.cleanMarkdown.Value = config.conf["VisionAssistant"]["clean_markdown_chat"] - # Translators: Checkbox to enable copying AI responses to clipboard - self.copyToClipboard = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Copy AI responses to clipboard"))) - self.copyToClipboard.Value = config.conf["VisionAssistant"]["copy_to_clipboard"] - # Translators: Checkbox to skip chat window and only speak AI responses - self.skipChatDialog = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Direct Output (No Chat Window)"))) - self.skipChatDialog.Value = config.conf["VisionAssistant"]["skip_chat_dialog"] - settingsSizer.Add(connectionSizer, 0, wx.EXPAND | wx.ALL, 5) - - # --- Translation Languages Group --- - # Translators: Title of the settings group for translation languages configuration - groupLabel = _("Translation Languages") - langBox = wx.StaticBox(self, label=groupLabel) - langSizer = wx.StaticBoxSizer(langBox, wx.VERTICAL) - lHelper = gui.guiHelper.BoxSizerHelper(langBox, sizer=langSizer) - - # Translators: Label for Source Language selection - self.sourceLang = lHelper.addLabeledControl(_("Source:"), wx.Choice, choices=SOURCE_NAMES) - try: self.sourceLang.SetSelection(SOURCE_NAMES.index(config.conf["VisionAssistant"]["source_language"])) - except: self.sourceLang.SetSelection(0) - - # Translators: Label for Target Language selection - self.targetLang = lHelper.addLabeledControl(_("Target:"), wx.Choice, choices=TARGET_NAMES) - try: 
self.targetLang.SetSelection(TARGET_NAMES.index(config.conf["VisionAssistant"]["target_language"])) - except: self.targetLang.SetSelection(0) - - # Translators: Label for AI Response Language selection - self.aiResponseLang = lHelper.addLabeledControl(_("AI Response:"), wx.Choice, choices=TARGET_NAMES) - try: self.aiResponseLang.SetSelection(TARGET_NAMES.index(config.conf["VisionAssistant"]["ai_response_language"])) - except: self.aiResponseLang.SetSelection(0) - - # Translators: Checkbox for Smart Swap feature - self.smartSwap = lHelper.addItem(wx.CheckBox(langBox, label=_("Smart Swap"))) - self.smartSwap.Value = config.conf["VisionAssistant"]["smart_swap"] - settingsSizer.Add(langSizer, 0, wx.EXPAND | wx.ALL, 5) - - # --- Document Reader Settings --- - # Translators: Title of settings group for Document Reader features - groupLabel = _("Document Reader") - docBox = wx.StaticBox(self, label=groupLabel) - docSizer = wx.StaticBoxSizer(docBox, wx.VERTICAL) - dHelper = gui.guiHelper.BoxSizerHelper(docBox, sizer=docSizer) - - # Translators: Label for OCR Engine selection - self.ocr_sel = dHelper.addLabeledControl(_("OCR Engine:"), wx.Choice, choices=[x[0] for x in OCR_ENGINES]) - curr_ocr = config.conf["VisionAssistant"]["ocr_engine"] - try: - o_idx = next(i for i, v in enumerate(OCR_ENGINES) if v[1] == curr_ocr) - self.ocr_sel.SetSelection(o_idx) - except: self.ocr_sel.SetSelection(0) - - voice_choices = [f"{v[0]} - {v[1]}" for v in GEMINI_VOICES] - # Translators: Label for TTS Voice selection - self.voice_sel = dHelper.addLabeledControl(_("TTS Voice:"), wx.Choice, choices=voice_choices) - curr_voice = config.conf["VisionAssistant"]["tts_voice"] - try: - v_idx = next(i for i, v in enumerate(GEMINI_VOICES) if v[0] == curr_voice) - self.voice_sel.SetSelection(v_idx) - except: self.voice_sel.SetSelection(1) - settingsSizer.Add(docSizer, 0, wx.EXPAND | wx.ALL, 5) - - # --- CAPTCHA Group --- - # Translators: Title of the settings group for CAPTCHA options - groupLabel = 
_("CAPTCHA") - capBox = wx.StaticBox(self, label=groupLabel) - capSizer = wx.StaticBoxSizer(capBox, wx.VERTICAL) - capHelper = gui.guiHelper.BoxSizerHelper(capBox, sizer=capSizer) - # Translators: Label for CAPTCHA capture method selection - self.captchaMode = capHelper.addLabeledControl(_("Capture Method:"), wx.Choice, choices=[ - # Translators: A choice for capture method. Captures only the specific object under the NVDA navigator cursor. - _("Navigator Object"), - # Translators: A choice for capture method. Captures the entire visible screen area. - _("Full Screen") - ]) - self.captchaMode.SetSelection(0 if config.conf["VisionAssistant"]["captcha_mode"] == 'navigator' else 1) - settingsSizer.Add(capSizer, 0, wx.EXPAND | wx.ALL, 5) - - self.defaultPromptItems = get_configured_default_prompts() - self.customPromptItems = load_configured_custom_prompts() - - # --- Prompt Manager Group --- - # Translators: Title of the settings group for prompt management - groupLabel = _("Prompts") - promptsBox = wx.StaticBox(self, label=groupLabel) - promptsSizer = wx.StaticBoxSizer(promptsBox, wx.VERTICAL) - pHelper = gui.guiHelper.BoxSizerHelper(promptsBox, sizer=promptsSizer) - # Translators: Description for the prompt manager button. - pHelper.addItem(wx.StaticText(promptsBox, label=_("Manage default and custom prompts."))) - # Translators: Button label to open prompt manager dialog. - self.managePromptsBtn = wx.Button(promptsBox, label=_("Manage Prompts...")) - self.managePromptsBtn.Bind(wx.EVT_BUTTON, self.onManagePrompts) - pHelper.addItem(self.managePromptsBtn) - self.promptsSummary = wx.StaticText(promptsBox) - pHelper.addItem(self.promptsSummary) - self._refreshPromptSummary() - settingsSizer.Add(promptsSizer, 0, wx.EXPAND | wx.ALL, 5) - - def _refreshPromptSummary(self): - # Translators: Summary text for prompt counts in settings. 
- summary = _("Default prompts: {defaultCount}, Custom prompts: {customCount}").format( - defaultCount=len(self.defaultPromptItems), - customCount=len(self.customPromptItems), - ) - self.promptsSummary.SetLabel(summary) - - def onManagePrompts(self, event): - top = wx.GetTopLevelParent(self) - dlg = PromptManagerDialog( - self, - self.defaultPromptItems, - self.customPromptItems, - PROMPT_VARIABLES_GUIDE, - ) - try: - if dlg.ShowModal() == wx.ID_OK: - self.defaultPromptItems = dlg.get_default_items() - self.customPromptItems = dlg.get_custom_items() - self._refreshPromptSummary() - finally: - dlg.Destroy() - if top: - top.Enable(True) - top.SetFocus() - - def onToggleApiVisibility(self, event): - if self.showApiCheck.IsChecked(): - self.apiKeyCtrl_visible.SetValue(self.apiKeyCtrl_hidden.GetValue()) - self.apiKeyCtrl_hidden.Hide() - self.apiKeyCtrl_visible.Show() - else: - self.apiKeyCtrl_hidden.SetValue(self.apiKeyCtrl_visible.GetValue()) - self.apiKeyCtrl_visible.Hide() - self.apiKeyCtrl_hidden.Show() - - self.connectionBox.GetParent().Layout() - - def onSave(self): - val = self.apiKeyCtrl_visible.GetValue() if self.showApiCheck.IsChecked() else self.apiKeyCtrl_hidden.GetValue() - config.conf["VisionAssistant"]["api_key"] = val.strip() - config.conf["VisionAssistant"]["model_name"] = MODELS[self.model.GetSelection()][1] - config.conf["VisionAssistant"]["proxy_url"] = self.proxyUrl.Value.strip() - config.conf["VisionAssistant"]["source_language"] = SOURCE_NAMES[self.sourceLang.GetSelection()] - config.conf["VisionAssistant"]["target_language"] = TARGET_NAMES[self.targetLang.GetSelection()] - config.conf["VisionAssistant"]["ai_response_language"] = TARGET_NAMES[self.aiResponseLang.GetSelection()] - config.conf["VisionAssistant"]["smart_swap"] = self.smartSwap.Value - config.conf["VisionAssistant"]["check_update_startup"] = self.checkUpdateStartup.Value - config.conf["VisionAssistant"]["clean_markdown_chat"] = self.cleanMarkdown.Value - 
config.conf["VisionAssistant"]["copy_to_clipboard"] = self.copyToClipboard.Value - config.conf["VisionAssistant"]["skip_chat_dialog"] = self.skipChatDialog.Value - config.conf["VisionAssistant"]["captcha_mode"] = 'navigator' if self.captchaMode.GetSelection() == 0 else 'fullscreen' - config.conf["VisionAssistant"]["custom_prompts_v2"] = serialize_custom_prompts_v2(self.customPromptItems) - config.conf["VisionAssistant"]["custom_prompts"] = "" - config.conf["VisionAssistant"]["default_refine_prompts"] = serialize_default_prompt_overrides(self.defaultPromptItems) - config.conf["VisionAssistant"]["ocr_engine"] = OCR_ENGINES[self.ocr_sel.GetSelection()][1] - config.conf["VisionAssistant"]["tts_voice"] = GEMINI_VOICES[self.voice_sel.GetSelection()][0] - -class RangeDialog(wx.Dialog): - def __init__(self, parent, total_pages): - # Translators: Title of the PDF options dialog - super().__init__(parent, title=_("Options"), size=(350, 320)) - sizer = wx.BoxSizer(wx.VERTICAL) - # Translators: Label showing total pages found - sizer.Add(wx.StaticText(self, label=_("Total Pages (All Files): {count}").format(count=total_pages)), 0, wx.ALL, 10) - - # Translators: Box title for page range selection - box_range = wx.StaticBoxSizer(wx.VERTICAL, self, _("Range")) - g_sizer = wx.FlexGridSizer(2, 2, 10, 10) - # Translators: Label for start page - g_sizer.Add(wx.StaticText(self, label=_("From:")), 0, wx.ALIGN_CENTER_VERTICAL) - self.spin_from = wx.SpinCtrl(self, min=1, max=total_pages, initial=1) - g_sizer.Add(self.spin_from, 1, wx.EXPAND) - # Translators: Label for end page - g_sizer.Add(wx.StaticText(self, label=_("To:")), 0, wx.ALIGN_CENTER_VERTICAL) - self.spin_to = wx.SpinCtrl(self, min=1, max=total_pages, initial=total_pages) - g_sizer.Add(self.spin_to, 1, wx.EXPAND) - box_range.Add(g_sizer, 1, wx.EXPAND | wx.ALL, 5) - sizer.Add(box_range, 0, wx.EXPAND | wx.ALL, 10) - - # Translators: Box title for translation options - box_trans = wx.StaticBoxSizer(wx.VERTICAL, self, 
_("Translation")) - # Translators: Checkbox to enable translation - self.chk_trans = wx.CheckBox(self, label=_("Translate Output")) - box_trans.Add(self.chk_trans, 0, wx.ALL, 5) - h_sizer = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Label for target language - h_sizer.Add(wx.StaticText(self, label=_("Target:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5) - self.cmb_lang = wx.Choice(self, choices=TARGET_NAMES) - self.cmb_lang.SetSelection(0) - h_sizer.Add(self.cmb_lang, 1) - box_trans.Add(h_sizer, 1, wx.EXPAND | wx.ALL, 5) - sizer.Add(box_trans, 0, wx.EXPAND | wx.ALL, 10) - - btn_sizer = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Button to start processing - btn_ok = wx.Button(self, wx.ID_OK, label=_("Start")) - btn_ok.SetDefault() - # Translators: Button to cancel - btn_cancel = wx.Button(self, wx.ID_CANCEL, label=_("Cancel")) - btn_sizer.Add(btn_ok, 0, wx.RIGHT, 10) - btn_sizer.Add(btn_cancel, 0) - sizer.Add(btn_sizer, 0, wx.ALIGN_CENTER | wx.ALL, 10) - self.SetSizer(sizer) - - self.chk_trans.Bind(wx.EVT_CHECKBOX, self.on_check) - self.cmb_lang.Disable() - - def on_check(self, event): - self.cmb_lang.Enable(self.chk_trans.IsChecked()) - - def get_settings(self): - return { - 'start': self.spin_from.GetValue() - 1, - 'end': self.spin_to.GetValue() - 1, - 'translate': self.chk_trans.IsChecked(), - 'lang': TARGET_NAMES[self.cmb_lang.GetSelection()] - } - -class ChatDialog(wx.Dialog): - instance = None - - def __init__(self, parent, file_path): - # Translators: Title of the chat dialog - super().__init__(parent, title=_("Ask about Document"), size=(600, 500), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER) - ChatDialog.instance = self - self.file_path = file_path - self.file_uri = None - self.mime_type = get_mime_type(file_path) - self.history = [] - - sizer = wx.BoxSizer(wx.VERTICAL) - # Translators: Label showing the analyzed file name - lbl_info = wx.StaticText(self, label=_("File: {name}").format(name=os.path.basename(file_path))) - sizer.Add(lbl_info, 0, 
wx.ALL, 5) - self.display = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2) - sizer.Add(self.display, 1, wx.EXPAND | wx.ALL, 10) - # Translators: Status message while uploading - self.display.SetValue(_("Uploading to Gemini...\n")) - - input_sizer = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Label for the chat input field - input_sizer.Add(wx.StaticText(self, label=_("Your Question:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5) - self.input = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, size=(-1, 30)) - self.input.Bind(wx.EVT_TEXT_ENTER, self.on_send) - input_sizer.Add(self.input, 1, wx.EXPAND | wx.RIGHT, 5) - - # Translators: Button to send message - self.btn_send = wx.Button(self, label=_("Send")) - self.btn_send.Bind(wx.EVT_BUTTON, self.on_send) - self.btn_send.Disable() - input_sizer.Add(self.btn_send, 0) - sizer.Add(input_sizer, 0, wx.EXPAND | wx.ALL, 10) - self.SetSizer(sizer) - self.Bind(wx.EVT_CLOSE, self.on_close) - threading.Thread(target=self.init_upload, daemon=True).start() - - def on_close(self, event): - ChatDialog.instance = None - self.Destroy() - - def init_upload(self): - uri = GeminiHandler.upload_for_chat(self.file_path, self.mime_type) - if uri and not str(uri).startswith("ERROR:"): - self.file_uri = uri - wx.CallAfter(self.on_ready) - else: - err_msg = str(uri)[6:] if uri else _("Upload failed.") - wx.CallAfter(show_error_dialog, err_msg) - wx.CallAfter(self.Close) - - def on_ready(self): - # Translators: Message when ready to chat - self.display.AppendText(_("Ready! 
Ask your questions.\n")) - self.btn_send.Enable() - self.input.SetFocus() - - def on_send(self, event): - msg = self.input.GetValue().strip() - if not msg: return - self.input.Clear() - self.display.AppendText(f"You: {msg}\n") - # Translators: Message showing AI is thinking - ui.message(_("Thinking...")) - threading.Thread(target=self.do_chat, args=(msg,), daemon=True).start() - - def do_chat(self, msg): - resp = GeminiHandler.chat(self.history, msg, self.file_uri, self.mime_type) - - if str(resp).startswith("ERROR:"): - show_error_dialog(resp[6:]) - if _vision_assistant_instance: - # Translators: Initial status when the add-on is doing nothing - _vision_assistant_instance.current_status = _("Idle") - return - - self.history.append({"role": "user", "parts": [{"text": msg}]}) - self.history.append({"role": "model", "parts": [{"text": resp}]}) - wx.CallAfter(self.display.AppendText, f"AI: {resp}\n\n") - # Translators: Spoken prefix for AI response - wx.CallAfter(ui.message, _("AI: ") + resp) - -class DocumentViewerDialog(wx.Dialog): - def __init__(self, parent, virtual_doc, settings): - # Translators: Title of the Document Reader window. 
- title_text = f"{ADDON_NAME} - {_('Document Reader')}" - super().__init__(parent, title=title_text, size=(800, 600), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER | wx.MAXIMIZE_BOX) - self.v_doc = virtual_doc - self.start_page = settings['start'] - self.end_page = settings['end'] - self.do_translate = settings['translate'] - self.target_lang = settings['lang'] - self.range_count = self.end_page - self.start_page + 1 - self.page_cache = {} - self.current_page = self.start_page - self.thread_pool = ThreadPoolExecutor(max_workers=5) - - self.init_ui() - self.Centre() - threading.Thread(target=self.start_auto_processing, daemon=True).start() - - def init_ui(self): - panel = wx.Panel(self) - vbox = wx.BoxSizer(wx.VERTICAL) - # Translators: Initial status message - self.lbl_status = wx.StaticText(panel, label=_("Initializing...")) - vbox.Add(self.lbl_status, 0, wx.ALL, 5) - self.txt_content = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2) - vbox.Add(self.txt_content, 1, wx.EXPAND | wx.LEFT | wx.RIGHT, 10) - hbox_nav = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Button to go to previous page - self.btn_prev = wx.Button(panel, label=_("Previous (Ctrl+PageUp)")) - self.btn_prev.Bind(wx.EVT_BUTTON, self.on_prev) - hbox_nav.Add(self.btn_prev, 0, wx.RIGHT, 5) - # Translators: Button to go to next page - self.btn_next = wx.Button(panel, label=_("Next (Ctrl+PageDown)")) - self.btn_next.Bind(wx.EVT_BUTTON, self.on_next) - hbox_nav.Add(self.btn_next, 0, wx.RIGHT, 15) - # Translators: Label for Go To Page - hbox_nav.Add(wx.StaticText(panel, label=_("Go to:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5) - choices = [str(i+1) for i in range(self.start_page, self.end_page + 1)] - self.cmb_pages = wx.Choice(panel, choices=choices) - self.cmb_pages.Bind(wx.EVT_CHOICE, self.on_page_select) - hbox_nav.Add(self.cmb_pages, 0, wx.RIGHT, 15) - vbox.Add(hbox_nav, 0, wx.ALIGN_CENTER | wx.ALL, 10) - hbox_actions = wx.BoxSizer(wx.HORIZONTAL) - # Translators: Button 
to Ask questions about the document - self.btn_ask = wx.Button(panel, label=_("Ask AI (Alt+A)")) - self.btn_ask.Bind(wx.EVT_BUTTON, self.on_ask) - hbox_actions.Add(self.btn_ask, 0, wx.RIGHT, 5) - - # Translators: Button to force re-scan - self.btn_gemini = wx.Button(panel, label=_("Re-scan with Gemini (Alt+R)")) - self.btn_gemini.Bind(wx.EVT_BUTTON, self.on_gemini_scan) - hbox_actions.Add(self.btn_gemini, 0, wx.RIGHT, 5) - - # Translators: Button to generate audio - self.btn_tts = wx.Button(panel, label=_("Generate Audio (Alt+G)")) - self.btn_tts.Bind(wx.EVT_BUTTON, self.on_tts) - hbox_actions.Add(self.btn_tts, 0, wx.RIGHT, 5) - - # Translators: Button to view formatted content - self.btn_view = wx.Button(panel, label=_("View Formatted")) - self.btn_view.Bind(wx.EVT_BUTTON, self.on_view) - hbox_actions.Add(self.btn_view, 0, wx.RIGHT, 5) - - # Translators: Button to save text - self.btn_save = wx.Button(panel, label=_("Save (Alt+S)")) - self.btn_save.Bind(wx.EVT_BUTTON, self.on_save_all) - hbox_actions.Add(self.btn_save, 0) - - vbox.Add(hbox_actions, 0, wx.ALIGN_CENTER | wx.ALL, 5) - btn_close = wx.Button(panel, wx.ID_CLOSE, label=_("Close")) - btn_close.Bind(wx.EVT_BUTTON, lambda e: self.Destroy()) - vbox.Add(btn_close, 0, wx.ALIGN_RIGHT | wx.ALL, 10) - panel.SetSizer(vbox) - accel_tbl = wx.AcceleratorTable([ - (wx.ACCEL_CTRL, wx.WXK_PAGEDOWN, self.btn_next.GetId()), - (wx.ACCEL_CTRL, wx.WXK_PAGEUP, self.btn_prev.GetId()), - (wx.ACCEL_CTRL, ord('S'), self.btn_save.GetId()), - (wx.ACCEL_ALT, ord('S'), self.btn_save.GetId()), - (wx.ACCEL_ALT, ord('A'), self.btn_ask.GetId()), - (wx.ACCEL_ALT, ord('R'), self.btn_gemini.GetId()), - (wx.ACCEL_ALT, ord('G'), self.btn_tts.GetId()) - ]) - self.SetAcceleratorTable(accel_tbl) - self.cmb_pages.SetSelection(0) - self.update_view() - self.txt_content.SetFocus() - - def start_auto_processing(self): - engine = config.conf["VisionAssistant"]["ocr_engine"] - - if engine == 'gemini': - 
threading.Thread(target=self.gemini_scan_batch_thread, daemon=True).start() - else: - for i in range(self.start_page, self.end_page + 1): - self.thread_pool.submit(self.process_page_worker, i) - - def process_page_worker(self, page_num): - if page_num in self.page_cache: return - text = self._get_page_text_logic(page_num) - self.page_cache[page_num] = text - if page_num == self.current_page: - wx.CallAfter(self.update_view) - # Translators: Spoken message when the current page is ready - wx.CallAfter(ui.message, _("Page {num} ready").format(num=page_num + 1)) - - def _get_page_text_logic(self, page_num): - file_path, page_idx = self.v_doc.get_page_info(page_num) - if not file_path: return "" - try: - doc = fitz.open(file_path) - page = doc.load_page(page_idx) - pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) - img_bytes = pix.tobytes("jpg") - doc.close() - engine = config.conf["VisionAssistant"]["ocr_engine"] - text = None - if engine == 'gemini': - try: text = GeminiHandler.ocr_page(img_bytes) - except: text = None - if not text or not text.strip() or engine == 'chrome': - text = ChromeOCREngine.recognize(img_bytes) - if not text or not text.strip(): - text = SmartProgrammersOCREngine.recognize(img_bytes) - if not text or not text.strip(): - # Translators: Placeholder text when OCR fails - text = _("[OCR failed. 
Try Gemini Re-scan.]") - if self.do_translate and text and "[OCR failed" not in text: - if engine == 'gemini': - text = GeminiHandler.translate(text, self.target_lang) - else: - text = GoogleTranslator.translate(text, self.target_lang) - return text - except: - # Translators: Error message for page processing failure - return _("Error processing page.") - - def update_view(self): - rel_page = self.current_page - self.start_page + 1 - # Translators: Status label format - self.lbl_status.SetLabel(_("Page {current} of {total}").format(current=rel_page, total=self.range_count)) - if self.current_page in self.page_cache: - self.txt_content.SetValue(self.page_cache[self.current_page]) - self.txt_content.SetInsertionPoint(0) - self.txt_content.SetFocus() - else: - # Translators: Status when page is loading - self.txt_content.SetValue(_("Processing in background...")) - self.txt_content.SetInsertionPoint(0) - self.txt_content.SetFocus() - self.btn_prev.Enable(self.current_page > self.start_page) - self.btn_next.Enable(self.current_page < self.end_page) - - def load_page(self, page_num): - if page_num < self.start_page or page_num > self.end_page: return - self.current_page = page_num - self.cmb_pages.SetSelection(page_num - self.start_page) - # Translators: Spoken message when switching pages - ui.message(_("Page {num}").format(num=page_num + 1)) - self.update_view() - - def on_prev(self, event): - if self.current_page > self.start_page: self.load_page(self.current_page - 1) - - def on_next(self, event): - if self.current_page < self.end_page: self.load_page(self.current_page + 1) - - def on_page_select(self, event): - self.load_page(self.start_page + self.cmb_pages.GetSelection()) - - def on_view(self, event): - full_html = [] - for i in range(self.start_page, self.end_page + 1): - if i in self.page_cache: - page_text = self.page_cache[i] - page_content = markdown_to_html(page_text, full_page=False) - # Translators: Heading for each page in the formatted content view. 
- page_label = _("Page {num}").format(num=i+1) - full_html.append(f"

{page_label}

") - full_html.append(page_content) - full_html.append("
") - - if not full_html: - text = self.txt_content.GetValue() - if not text: return - full_html.append(markdown_to_html(text, full_page=False)) - - combined_html = "".join(full_html) - try: - # Translators: Title of the formatted result window - ui.browseableMessage(combined_html, _("Formatted Content"), isHtml=True) - except Exception as e: - show_error_dialog(str(e)) - - def on_gemini_scan(self, event): - if not config.conf["VisionAssistant"]["api_key"]: - wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), wx.ICON_ERROR) - return - menu = wx.Menu() - # Translators: Menu option for current page - item_curr = menu.Append(wx.ID_ANY, _("Current Page")) - # Translators: Menu option for all pages - item_all = menu.Append(wx.ID_ANY, _("All Pages (In Range)")) - self.Bind(wx.EVT_MENU, self.do_rescan_current, item_curr) - self.Bind(wx.EVT_MENU, self.do_rescan_all, item_all) - self.PopupMenu(menu) - menu.Destroy() - - def do_rescan_current(self, event): - if self.current_page in self.page_cache: del self.page_cache[self.current_page] - self.update_view() - # Translators: Message during manual scan - ui.message(_("Scanning with Gemini...")) - threading.Thread(target=self.gemini_scan_single_thread, args=(self.current_page,), daemon=True).start() - - def gemini_scan_single_thread(self, page_num): - try: - file_path, page_idx = self.v_doc.get_page_info(page_num) - doc = fitz.open(file_path) - page = doc.load_page(page_idx) - pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) - text = GeminiHandler.ocr_page(pix.tobytes("jpg")) - doc.close() - if self.do_translate: text = GeminiHandler.translate(text, self.target_lang) - self.page_cache[page_num] = text - if self.current_page == page_num: - wx.CallAfter(self.update_view) - # Translators: Message when scan is complete - wx.CallAfter(ui.message, _("Scan complete")) - except: pass - - def do_rescan_all(self, event): - threading.Thread(target=self.gemini_scan_batch_thread, daemon=True).start() - - def 
gemini_scan_batch_thread(self): - # Translators: Message when batch scan starts - msg = _("Batch Processing Started") - if _vision_assistant_instance: _vision_assistant_instance.current_status = msg - wx.CallAfter(ui.message, msg) - - for i in range(self.start_page, self.end_page + 1): - if i in self.page_cache: del self.page_cache[i] - wx.CallAfter(self.update_view) - - upload_path = self.v_doc.create_merged_pdf(self.start_page, self.end_page) - if not upload_path: - # Translators: Error message if PDF creation fails - wx.CallAfter(self.lbl_status.SetLabel, _("Error creating temporary PDF.")) - return - - try: - count = (self.end_page - self.start_page) + 1 - results = GeminiHandler.upload_and_process_batch(upload_path, "application/pdf", count) - - if not results or (len(results) == 1 and str(results[0]).startswith("ERROR:")): - err_msg = results[0][6:] if results else _("Unknown error") - # Translators: Message reported when batch scan fails - error_text = _("Scan failed: {err}").format(err=err_msg) - for i in range(self.start_page, self.end_page + 1): - self.page_cache[i] = error_text - - wx.CallAfter(self.update_view) - wx.CallAfter(ui.message, error_text) - return - - for i, text_part in enumerate(results): - if i >= count: break - idx = self.start_page + i - clean = text_part.strip() - if self.do_translate: - clean = GeminiHandler.translate(clean, self.target_lang) - self.page_cache[idx] = clean - - wx.CallAfter(self.update_view) - # Translators: Message when batch scan is complete - final_msg = _("Batch Scan Complete") - if _vision_assistant_instance: - # Translators: Initial status when the add-on is doing nothing - _vision_assistant_instance.current_status = _("Idle") - wx.CallAfter(ui.message, final_msg) - finally: - if upload_path and os.path.exists(upload_path): - try: os.remove(upload_path) - except: pass - - def on_tts(self, event): - if not config.conf["VisionAssistant"]["api_key"]: - wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), 
wx.ICON_ERROR) - return - menu = wx.Menu() - # Translators: Menu option for TTS current page - item_curr = menu.Append(wx.ID_ANY, _("Generate for Current Page")) - # Translators: Menu option for TTS all pages - item_all = menu.Append(wx.ID_ANY, _("Generate for All Pages (In Range)")) - self.Bind(wx.EVT_MENU, self.do_tts_current, item_curr) - self.Bind(wx.EVT_MENU, self.do_tts_all, item_all) - self.PopupMenu(menu) - menu.Destroy() - - def do_tts_current(self, event): - text = self.txt_content.GetValue().strip() - if not text: - # Translators: Error message when text field is empty - wx.MessageBox(_("No text to read."), "Error") - return - self._save_tts(text) - - def do_tts_all(self, event): - threading.Thread(target=self.tts_batch_thread, daemon=True).start() - - def tts_batch_thread(self): - full_text = [] - # Translators: Message while gathering text - wx.CallAfter(ui.message, _("Gathering text for audio...")) - for i in range(self.start_page, self.end_page + 1): - while i not in self.page_cache: time.sleep(0.1) - full_text.append(self.page_cache[i]) - final_text = "\n".join(full_text).strip() - if not final_text: return - wx.CallAfter(self._save_tts, final_text) - - def _save_tts(self, text): - # Translators: File dialog title for saving audio - path = get_file_path(_("Save Audio"), "MP3 Files (*.mp3)|*.mp3|WAV Files (*.wav)|*.wav", mode="save") - if path: - voice = config.conf["VisionAssistant"]["tts_voice"] - threading.Thread(target=self.tts_worker, args=(text, voice, path), daemon=True).start() - - def tts_worker(self, text, voice, path): - # Translators: Message while generating audio - msg = _("Generating Audio...") - if _vision_assistant_instance: _vision_assistant_instance.current_status = msg - wx.CallAfter(ui.message, msg) - try: - audio_b64 = GeminiHandler.generate_speech(text, voice) - if not audio_b64 or len(audio_b64) < 100: - wx.CallAfter(wx.MessageBox, f"TTS Error: {audio_b64}", "Error", wx.ICON_ERROR) - return - missing_padding = len(audio_b64) % 
4 - if missing_padding: audio_b64 += '=' * (4 - missing_padding) - pcm_data = base64.b64decode(audio_b64) - - if path.lower().endswith(".mp3"): - import subprocess - lame_path = os.path.join(os.path.dirname(__file__), "lib", "lame.exe") - if not os.path.exists(lame_path): - wx.CallAfter(wx.MessageBox, _("lame.exe not found in lib folder."), "Error", wx.ICON_ERROR) - return - - process = subprocess.Popen( - [lame_path, "-r", "-s", "24", "-m", "m", "-b", "128", "--bitwidth", "16", "--resample", "24", "-q", "0", "-", path], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - creationflags=getattr(subprocess, 'CREATE_NO_WINDOW', 0) - ) - - process.communicate(input=pcm_data) - else: - with wave.open(path, "wb") as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(24000) - wf.writeframes(pcm_data) - - # Translators: Spoken message when audio is saved - res_msg = _("Audio Saved") - if _vision_assistant_instance: _vision_assistant_instance.current_status = _("Idle") - wx.CallAfter(ui.message, res_msg) - wx.CallAfter(wx.MessageBox, _("Audio file generated and saved successfully."), _("Success"), wx.OK | wx.ICON_INFORMATION) - except Exception as e: - if _vision_assistant_instance: _vision_assistant_instance.current_status = _("Idle") - wx.CallAfter(wx.MessageBox, f"TTS Error: {e}", "Error", wx.ICON_ERROR) - - def on_ask(self, event): - if not config.conf["VisionAssistant"]["api_key"]: - wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), wx.ICON_ERROR) - return - if ChatDialog.instance: - ChatDialog.instance.Raise() - ChatDialog.instance.SetFocus() - return - file_path, _ = self.v_doc.get_page_info(self.current_page) - if file_path: - dlg = ChatDialog(self, file_path) - dlg.Show() - - def on_save_all(self, event): - # Translators: File dialog filter for saving text/html - wildcard = "Text File (*.txt)|*.txt|HTML File (*.html)|*.html" - # Translators: File dialog title for saving - path = get_file_path(_("Save"), wildcard, 
mode="save") - if path: - is_html = path.lower().endswith('.html') - self.btn_save.Disable() - threading.Thread(target=self.save_thread, args=(path, is_html), daemon=True).start() - - def save_thread(self, path, is_html): - full_content = [] - try: - for i in range(self.start_page, self.end_page + 1): - # Translators: Message showing save progress - wx.CallAfter(self.lbl_status.SetLabel, _("Saving Page {num}...").format(num=i+1)) - while i not in self.page_cache: time.sleep(0.1) - txt = self.page_cache[i] - if is_html: - h = markdown_to_html(txt) - if "" in h: h = h.split("")[1].split("")[0] - full_content.append(f"

Page {i+1}

{h}") - else: - full_content.append(f"--- Page {i+1} ---\n{txt}\n") - with open(path, "w", encoding="utf-8") as f: - if is_html: f.write(f"{''.join(full_content)}") - else: f.write("\n".join(full_content)) - # Translators: Status label when save is complete - wx.CallAfter(self.lbl_status.SetLabel, _("Saved")) - # Translators: Message box content for successful save - wx.CallAfter(wx.MessageBox, _("File saved successfully."), _("Success"), wx.OK | wx.ICON_INFORMATION) - except Exception as e: - wx.CallAfter(wx.MessageBox, f"Save Error: {e}", "Error", wx.ICON_ERROR) - finally: wx.CallAfter(self.btn_save.Enable) +from .updater import UpdateDialog, UpdateManager + +from .dialogs import ( + ChatDialog, + DocumentViewerDialog, + RangeDialog, + SettingsPanel, + VisionQADialog, + set_vision_assistant_instance, +) class GlobalPlugin(globalPluginHandler.GlobalPlugin): scriptCategory = ADDON_NAME @@ -2379,8 +134,7 @@ class GlobalPlugin(globalPluginHandler.GlobalPlugin): def __init__(self): super(GlobalPlugin, self).__init__() - global _vision_assistant_instance - _vision_assistant_instance = self + set_vision_assistant_instance(self) try: migrate_prompt_config_if_needed() except Exception as e: @@ -2503,7 +257,6 @@ def script_activateLayer(self, gesture): tones.beep(500, 100) def terminate(self): - global _vision_assistant_instance try: if hasattr(self, 'va_submenu_item') and self.va_submenu_item: self.tools_menu.Remove(self.va_submenu_item.GetId()) @@ -2529,7 +282,7 @@ def terminate(self): self.translation_cache = {} self._last_source_text = None - _vision_assistant_instance = None + set_vision_assistant_instance(None) gc.collect() def report_status(self, msg): diff --git a/addon/globalPlugins/visionAssistant/constants.py b/addon/globalPlugins/visionAssistant/constants.py new file mode 100644 index 0000000..bcd8ac2 --- /dev/null +++ b/addon/globalPlugins/visionAssistant/constants.py @@ -0,0 +1,393 @@ +# -*- coding: utf-8 -*- + +import addonHandler +import config + 
+addonHandler.initTranslation() + +ADDON_NAME = addonHandler.getCodeAddon().manifest["summary"] +GITHUB_REPO = "mahmoodhozhabri/VisionAssistantPro" + +CHROME_OCR_KEYS = [ + "AIzaSyA2KlwBX3mkFo30om9LUFYQhpqLoa_BNhE", + "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" +] + +MODELS = [ + # --- 1. Recommended (Auto-Updating) --- + # Translators: AI Model info. [Auto] = Automatic updates. (Latest) = Newest version. + (_("[Auto]") + " Gemini Flash " + _("(Latest)"), "gemini-flash-latest"), + (_("[Auto]") + " Gemini Flash Lite " + _("(Latest)"), "gemini-flash-lite-latest"), + + # --- 2. Current Standard (Free & Fast) --- + # Translators: AI Model info. [Free] = Generous usage limits. (Preview) = Experimental or early-access version. + (_("[Free]") + " Gemini 3.0 Flash " + _("(Preview)"), "gemini-3-flash-preview"), + (_("[Free]") + " Gemini 2.5 Flash", "gemini-2.5-flash"), + (_("[Free]") + " Gemini 2.5 Flash Lite", "gemini-2.5-flash-lite"), + + # --- 3. High Intelligence (Paid/Pro/Preview) --- + # Translators: AI Model info. [Pro] = High intelligence/Paid tier. (Preview) = Experimental version. + (_("[Pro]") + " Gemini 3.0 Pro " + _("(Preview)"), "gemini-3-pro-preview"), + (_("[Pro]") + " Gemini 2.5 Pro", "gemini-2.5-pro"), +] + +GEMINI_VOICES = [ + # Translators: Adjective describing a bright AI voice style. + ("Zephyr", _("Bright")), + # Translators: Adjective describing an upbeat AI voice style. + ("Puck", _("Upbeat")), + # Translators: Adjective describing an informative AI voice style. + ("Charon", _("Informative")), + # Translators: Adjective describing a firm AI voice style. + ("Kore", _("Firm")), + # Translators: Adjective describing an excitable AI voice style. + ("Fenrir", _("Excitable")), + # Translators: Adjective describing a youthful AI voice style. + ("Leda", _("Youthful")), + # Translators: Adjective describing a firm AI voice style. + ("Orus", _("Firm")), + # Translators: Adjective describing a breezy AI voice style. 
+ ("Aoede", _("Breezy")), + # Translators: Adjective describing an easy-going AI voice style. + ("Callirrhoe", _("Easy-going")), + # Translators: Adjective describing a bright AI voice style. + ("Autonoe", _("Bright")), + # Translators: Adjective describing a breathy AI voice style. + ("Enceladus", _("Breathy")), + # Translators: Adjective describing a clear AI voice style. + ("Iapetus", _("Clear")), + # Translators: Adjective describing an easy-going AI voice style. + ("Umbriel", _("Easy-going")), + # Translators: Adjective describing a smooth AI voice style. + ("Algieba", _("Smooth")), + # Translators: Adjective describing a smooth AI voice style. + ("Despina", _("Smooth")), + # Translators: Adjective describing a clear AI voice style. + ("Erinome", _("Clear")), + # Translators: Adjective describing a gravelly AI voice style. + ("Algenib", _("Gravelly")), + # Translators: Adjective describing an informative AI voice style. + ("Rasalgethi", _("Informative")), + # Translators: Adjective describing an upbeat AI voice style. + ("Laomedeia", _("Upbeat")), + # Translators: Adjective describing a soft AI voice style. + ("Achernar", _("Soft")), + # Translators: Adjective describing a firm AI voice style. + ("Alnilam", _("Firm")), + # Translators: Adjective describing an even AI voice style. + ("Schedar", _("Even")), + # Translators: Adjective describing a mature AI voice style. + ("Gacrux", _("Mature")), + # Translators: Adjective describing a forward AI voice style. + ("Pulcherrima", _("Forward")), + # Translators: Adjective describing a friendly AI voice style. + ("Achird", _("Friendly")), + # Translators: Adjective describing a casual AI voice style. + ("Zubenelgenubi", _("Casual")), + # Translators: Adjective describing a gentle AI voice style. + ("Vindemiatrix", _("Gentle")), + # Translators: Adjective describing a lively AI voice style. + ("Sadachbia", _("Lively")), + # Translators: Adjective describing a knowledgeable AI voice style. 
	("Sadaltager", _("Knowledgeable")),
	# Translators: Adjective describing a warm AI voice style.
	("Sulafat", _("Warm"))
]

# (English name, ISO 639-1 code) pairs for every language the add-on supports;
# kept roughly alphabetical by English name.
BASE_LANGUAGES = [
	("Arabic", "ar"), ("Bulgarian", "bg"), ("Chinese", "zh"), ("Czech", "cs"), ("Danish", "da"),
	("Dutch", "nl"), ("English", "en"), ("Finnish", "fi"), ("French", "fr"),
	("German", "de"), ("Greek", "el"), ("Hebrew", "he"), ("Hindi", "hi"),
	("Hungarian", "hu"), ("Indonesian", "id"), ("Italian", "it"), ("Japanese", "ja"),
	("Korean", "ko"), ("Nepali", "ne"), ("Norwegian", "no"), ("Persian", "fa"), ("Polish", "pl"),
	("Portuguese", "pt"), ("Romanian", "ro"), ("Russian", "ru"), ("Spanish", "es"),
	("Swedish", "sv"), ("Thai", "th"), ("Turkish", "tr"), ("Ukrainian", "uk"),
	("Vietnamese", "vi")
]
# Source-language choices additionally offer automatic detection.
SOURCE_LIST = [("Auto-detect", "auto")] + BASE_LANGUAGES
SOURCE_NAMES = [x[0] for x in SOURCE_LIST]
TARGET_LIST = BASE_LANGUAGES
TARGET_NAMES = [x[0] for x in TARGET_LIST]
# Display name -> ISO code lookup for target languages.
TARGET_CODES = {x[0]: x[1] for x in BASE_LANGUAGES}

OCR_ENGINES = [
	# Translators: OCR Engine option (Fast but less formatted)
	(_("Chrome (Fast)"), "chrome"),
	# Translators: OCR Engine option (Slower but better formatting)
	(_("Gemini (Formatted)"), "gemini")
]

# NVDA configuration specification for the "VisionAssistant" config section.
confspec = {
	"proxy_url": "string(default='')",
	"api_key": "string(default='')",
	"model_name": "string(default='gemini-flash-lite-latest')",
	"target_language": "string(default='English')",
	"source_language": "string(default='Auto-detect')",
	"ai_response_language": "string(default='English')",
	"smart_swap": "boolean(default=True)",
	"captcha_mode": "string(default='navigator')",
	"custom_prompts": "string(default='')",
	"custom_prompts_v2": "string(default='')",
	"default_refine_prompts": "string(default='')",
	"check_update_startup": "boolean(default=False)",
	"clean_markdown_chat": "boolean(default=True)",
	"copy_to_clipboard": "boolean(default=False)",
	"skip_chat_dialog": "boolean(default=False)",
	"ocr_engine": "string(default='chrome')",
	"tts_voice": "string(default='Puck')"
}

config.conf.spec["VisionAssistant"] = confspec

# System prompt driving "smart" translation; placeholders are filled via str.format.
PROMPT_TRANSLATE = """
Task: Translate the text below to "{target_lang}".

Configuration:
- Target Language: "{target_lang}"
- Swap Language: "{swap_target}"
- Smart Swap: {smart_swap}

Rules:
1. DEFAULT: Translate the input strictly to "{target_lang}".
2. MIXED CONTENT: If the text contains mixed languages (e.g., Arabic content with English UI terms like 'Reply', 'From', 'Forwarded'), translate EVERYTHING to "{target_lang}".
3. EXCEPTION: If (and ONLY if) the input is already completely in "{target_lang}" AND "Smart Swap" is True, then translate to "{swap_target}".

Constraints:
- Output ONLY the translation.
- Do NOT translate actual programming code (Python, C++, etc.) or URLs.
- Translate ALL UI elements, menus, and interface labels.

Input Text:
{text_content}
"""

# Prompt asking the model to locate a UI element; the reply is parsed as JSON.
PROMPT_UI_LOCATOR = "Analyze UI (Size: {width}x{height}). Request: '{query}'. Output JSON: {{\"x\": int, \"y\": int, \"found\": bool}}."

# Keys of the text-refinement prompts shown in the Refine menu.
REFINE_PROMPT_KEYS = ("summarize", "fix_grammar", "fix_translate", "explain")

# Old bracket-token syntax kept for backwards compatibility with v1 custom prompts.
LEGACY_REFINER_TOKENS = {
	"summarize": "[summarize]",
	"fix_grammar": "[fix_grammar]",
	"fix_translate": "[fix_translate]",
	"explain": "[explain]",
}

# Catalog of all built-in prompts, editable through the Prompt Manager.
# Entries marked "internal": True are used by the add-on itself and hidden
# from the default listing.
DEFAULT_SYSTEM_PROMPTS = (
	{
		"key": "summarize",
		# Translators: Section header for text refinement prompts in Prompt Manager.
		"section": _("Refine"),
		# Translators: Label for the text summarization prompt.
		"label": _("Summarize"),
		"prompt": "Summarize the text below in {response_lang}.",
	},
	{
		"key": "fix_grammar",
		# Translators: Section header for text refinement prompts in Prompt Manager.
		"section": _("Refine"),
		# Translators: Label for the grammar correction prompt.
		"label": _("Fix Grammar"),
		"prompt": "Fix grammar in the text below. Output ONLY the fixed text.",
	},
	{
		"key": "fix_translate",
		# Translators: Section header for text refinement prompts in Prompt Manager.
		"section": _("Refine"),
		# Translators: Label for the grammar correction and translation prompt.
		"label": _("Fix Grammar & Translate"),
		"prompt": "Fix grammar and translate to {target_lang}.{swap_instruction} Output ONLY the result.",
	},
	{
		"key": "explain",
		# Translators: Section header for text refinement prompts in Prompt Manager.
		"section": _("Refine"),
		# Translators: Label for the text explanation prompt.
		"label": _("Explain"),
		"prompt": "Explain the text below in {response_lang}.",
	},
	{
		"key": "translate_main",
		# Translators: Section header for translation-related prompts in Prompt Manager.
		"section": _("Translation"),
		# Translators: Label for the smart translation prompt.
		"label": _("Smart Translation"),
		"prompt": PROMPT_TRANSLATE.strip(),
	},
	{
		"key": "translate_quick",
		# Translators: Section header for translation-related prompts in Prompt Manager.
		"section": _("Translation"),
		# Translators: Label for the quick translation prompt.
		"label": _("Quick Translation"),
		"prompt": "Translate to {target_lang}. Output ONLY translation.",
	},
	{
		"key": "document_chat_system",
		# Translators: Section header for document-related prompts in Prompt Manager.
		"section": _("Document"),
		# Translators: Label for the initial context prompt in document chat.
		"label": _("Document Chat Context"),
		"prompt": "STRICTLY Respond in {response_lang}. Use Markdown formatting. Analyze the attached content to answer.",
	},
	{
		"key": "document_chat_ack",
		# Translators: Section header for advanced/internal prompts in Prompt Manager.
		"section": _("Advanced"),
		# Translators: Label for the AI's acknowledgement reply in document chat.
		"label": _("Document Chat Bootstrap Reply"),
		"internal": True,
		"prompt": "Context received. Ready for questions.",
	},
	{
		"key": "vision_navigator_object",
		# Translators: Section header for image analysis prompts in Prompt Manager.
		"section": _("Vision"),
		# Translators: Label for the prompt used to analyze the current navigator object.
		"label": _("Navigator Object Analysis"),
		"prompt": (
			"Analyze this image. Describe the layout, visible text, and UI elements. "
			"Use Markdown formatting (headings, lists) to organize the description. "
			"Language: {response_lang}. Ensure the response is strictly in {response_lang}. "
			"IMPORTANT: Start directly with the description content. Do not add introductory "
			"sentences like 'Here is the analysis' or 'The image shows'."
		),
	},
	{
		"key": "vision_fullscreen",
		# Translators: Section header for image analysis prompts in Prompt Manager.
		"section": _("Vision"),
		# Translators: Label for the prompt used to analyze the entire screen.
		"label": _("Full Screen Analysis"),
		"prompt": (
			"Analyze this image. Describe the layout, visible text, and UI elements. "
			"Use Markdown formatting (headings, lists) to organize the description. "
			"Language: {response_lang}. Ensure the response is strictly in {response_lang}. "
			"IMPORTANT: Start directly with the description content. Do not add introductory "
			"sentences like 'Here is the analysis' or 'The image shows'."
		),
	},
	{
		"key": "vision_followup_context",
		# Translators: Section header for advanced/internal prompts in Prompt Manager.
		"section": _("Advanced"),
		# Translators: Label for the follow-up context in image analysis chat.
		"label": _("Vision Follow-up Context"),
		"internal": True,
		"prompt": "Image Context. Target Language: {response_lang}",
	},
	{
		"key": "vision_followup_suffix",
		# Translators: Section header for advanced/internal prompts in Prompt Manager.
		"section": _("Advanced"),
		# Translators: Label for the rule enforced during image analysis follow-up questions.
		"label": _("Vision Follow-up Answer Rule"),
		"internal": True,
		"prompt": "Answer strictly in {response_lang}",
	},
	{
		"key": "video_analysis",
		# Translators: Section header for video analysis prompts in Prompt Manager.
		"section": _("Video"),
		# Translators: Label for the video content analysis prompt.
		"label": _("Video Analysis"),
		"prompt": (
			"Analyze this video. Provide a detailed description of the visual content and a "
			"summary of the audio. IMPORTANT: Write the entire response STRICTLY in "
			"{response_lang} language."
		),
	},
	{
		"key": "audio_transcription",
		# Translators: Section header for audio-related prompts in Prompt Manager.
		"section": _("Audio"),
		# Translators: Label for the audio file transcription prompt.
		"label": _("Audio Transcription"),
		"prompt": "Transcribe this audio in {response_lang}.",
	},
	{
		"key": "dictation_transcribe",
		# Translators: Section header for audio-related prompts in Prompt Manager.
		"section": _("Audio"),
		# Translators: Label for the smart voice dictation prompt.
		"label": _("Smart Dictation"),
		"prompt": (
			"Transcribe speech. Use native script. Fix stutters. If there is no speech, silence, "
			"or background noise only, write exactly: [[[NOSPEECH]]]"
		),
	},
	{
		"key": "ocr_image_extract",
		# Translators: Section header for OCR-related prompts in Prompt Manager.
		"section": _("OCR"),
		# Translators: Label for the OCR prompt used for image text extraction.
		"label": _("OCR Image Extraction"),
		"prompt": (
			"Extract all visible text from this image. Strictly preserve original formatting "
			"(headings, lists, tables) using Markdown. Do not output any system messages or "
			"code block backticks (```). Output ONLY the raw content."
		),
	},
	{
		"key": "ocr_document_extract",
		# Translators: Section header for OCR-related prompts in Prompt Manager.
		"section": _("OCR"),
		# Translators: Label for the OCR prompt used for document text extraction.
		"label": _("OCR Document Extraction"),
		"prompt": (
			"Extract all visible text from this document. Strictly preserve original formatting "
			"(headings, lists, tables) using Markdown. You MUST insert the exact delimiter "
			"'[[[PAGE_SEP]]]' immediately after the content of every single page. Do not output "
			"any system messages or code block backticks (```). Output ONLY the raw content."
		),
	},
	{
		"key": "ocr_document_translate",
		# Translators: Section header for document-related prompts in Prompt Manager.
		"section": _("Document"),
		# Translators: Label for the combined OCR and translation prompt for documents.
		"label": _("Document OCR + Translate"),
		"prompt": (
			"Extract all text from this document. Preserve formatting (Markdown). Then translate "
			"the content to {target_lang}. Output ONLY the translated content. Do not add "
			"explanations."
		),
	},
	{
		"key": "captcha_solver_base",
		# Translators: Section header for CAPTCHA-related prompts in Prompt Manager.
		"section": _("CAPTCHA"),
		# Translators: Label for the CAPTCHA solving prompt.
		"label": _("CAPTCHA Solver"),
		"internal": True,
		"prompt": (
			"Blind user. Return CAPTCHA code only. If NO CAPTCHA is detected in the image, "
			"strictly return: [[[NO_CAPTCHA]]].{captcha_extra}"
		),
	},
	{
		"key": "refine_files_only",
		# Translators: Section header for advanced/internal prompts in Prompt Manager.
		"section": _("Advanced"),
		# Translators: Label for the fallback prompt when only files are provided in Refine.
		"label": _("Refine Files-Only Fallback"),
		"internal": True,
		"prompt": "Analyze these files.",
	},
)

# (token, description, accepted input type) rows shown in the Variables Guide
# of the Prompt Manager.
PROMPT_VARIABLES_GUIDE = (
	# Translators: Description and input type for the [selection] variable in the Variables Guide.
	("[selection]", _("Currently selected text"), _("Text")),
	# Translators: Description for the [clipboard] variable in the Variables Guide.
	("[clipboard]", _("Clipboard content"), _("Text")),
	# Translators: Description and input type for the [screen_obj] variable in the Variables Guide.
	("[screen_obj]", _("Screenshot of the navigator object"), _("Image")),
	# Translators: Description for the [screen_full] variable in the Variables Guide.
	("[screen_full]", _("Screenshot of the entire screen"), _("Image")),
	# Translators: Description and input type for the [file_ocr] variable in the Variables Guide.
	("[file_ocr]", _("Select image/PDF/TIFF for text extraction"), _("Image, PDF, TIFF")),
	# Translators: Description and input type for the [file_read] variable in the Variables Guide.
	("[file_read]", _("Select document for reading"), _("TXT, Code, PDF")),
	# Translators: Description and input type for the [file_audio] variable in the Variables Guide.
	("[file_audio]", _("Select audio file for analysis"), _("MP3, WAV, OGG")),
)
diff --git a/addon/globalPlugins/visionAssistant/dialogs.py b/addon/globalPlugins/visionAssistant/dialogs.py
new file mode 100644
index 0000000..8d2e6fd
--- /dev/null
+++ b/addon/globalPlugins/visionAssistant/dialogs.py
@@ -0,0 +1,1014 @@
# -*- coding: utf-8 -*-

import os
import json
import io
import tempfile
import threading
import time
import gc
import wave
import logging
import base64
from concurrent.futures import ThreadPoolExecutor

import wx

import addonHandler
import config
import gui
import ui
import api
import textInfos
import tones
import scriptHandler

from .prompt_manager_dialog import PromptManagerDialog
from .constants import (
	ADDON_NAME,
	GEMINI_VOICES,
	MODELS,
	OCR_ENGINES,
	PROMPT_VARIABLES_GUIDE,
	SOURCE_NAMES,
	TARGET_NAMES,
)
from .markdown_utils import clean_markdown, markdown_to_html
from .prompt_helpers import (
	get_configured_default_prompts,
	load_configured_custom_prompts,
	serialize_custom_prompts_v2,
	serialize_default_prompt_overrides,
)
from .services import (
	ChromeOCREngine,
GeminiHandler, + GoogleTranslator, + SmartProgrammersOCREngine, + get_file_path, + get_mime_type, + show_error_dialog, +) + +try: + import fitz +except ImportError: + fitz = None + +log = logging.getLogger(__name__) +addonHandler.initTranslation() +_vision_assistant_instance = None + + +def set_vision_assistant_instance(instance): + global _vision_assistant_instance + _vision_assistant_instance = instance + +class VisionQADialog(wx.Dialog): + def __init__(self, parent, title, initial_text, context_data, callback_fn, extra_info=None, raw_content=None, status_callback=None, announce_on_open=True, allow_questions=True): + super(VisionQADialog, self).__init__(parent, title=title, size=(550, 500), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER) + self.context_data = context_data + self.callback_fn = callback_fn + self.extra_info = extra_info + self.chat_history = [] + self.raw_content = raw_content + self.status_callback = status_callback + self.announce_on_open = announce_on_open + self.allow_questions = allow_questions + + mainSizer = wx.BoxSizer(wx.VERTICAL) + # Translators: Label for the AI response text area in a chat dialog + lbl_text = _("AI Response:") + lbl = wx.StaticText(self, label=lbl_text) + mainSizer.Add(lbl, 0, wx.ALL, 5) + self.outputArea = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY) + mainSizer.Add(self.outputArea, 1, wx.EXPAND | wx.ALL, 5) + + self.should_clean = config.conf["VisionAssistant"]["clean_markdown_chat"] + display_text = clean_markdown(initial_text) if self.should_clean else initial_text + if display_text: + # Translators: Format for displaying AI message in a chat dialog + init_msg = _("AI: {text}\n").format(text=display_text) + self.outputArea.AppendText(init_msg) + if config.conf["VisionAssistant"]["copy_to_clipboard"]: + api.copyToClip(raw_content if raw_content else display_text) + + if not (extra_info and extra_info.get('skip_init_history')): + self.chat_history.append({"role": "model", "parts": [{"text": 
initial_text}]}) + + self.inputArea = None + if allow_questions: + # Translators: Label for user input field in a chat dialog + ask_text = _("Ask:") + inputLbl = wx.StaticText(self, label=ask_text) + mainSizer.Add(inputLbl, 0, wx.ALL, 5) + self.inputArea = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, size=(-1, 30)) + mainSizer.Add(self.inputArea, 0, wx.EXPAND | wx.ALL, 5) + + btnSizer = wx.BoxSizer(wx.HORIZONTAL) + self.askBtn = None + if allow_questions: + # Translators: Button to send message in a chat dialog + self.askBtn = wx.Button(self, label=_("Send")) + # Translators: Button to view the content in a formatted HTML window + self.viewBtn = wx.Button(self, label=_("View Formatted")) + self.viewBtn.Bind(wx.EVT_BUTTON, self.onView) + # Translators: Button to save only the result content without chat history + self.saveContentBtn = wx.Button(self, label=_("Save Content")) + self.saveContentBtn.Bind(wx.EVT_BUTTON, self.onSaveContent) + # Translators: Button to save chat in a chat dialog + self.saveBtn = wx.Button(self, label=_("Save Chat")) + # Translators: Button to close chat dialog + self.closeBtn = wx.Button(self, wx.ID_CANCEL, label=_("Close")) + + self.saveBtn.Enable(bool(initial_text.strip())) + self.viewBtn.Enable(bool(self.raw_content)) + self.saveContentBtn.Enable(bool(self.raw_content)) + + if self.askBtn: + btnSizer.Add(self.askBtn, 0, wx.ALL, 5) + btnSizer.Add(self.viewBtn, 0, wx.ALL, 5) + btnSizer.Add(self.saveContentBtn, 0, wx.ALL, 5) + btnSizer.Add(self.saveBtn, 0, wx.ALL, 5) + btnSizer.Add(self.closeBtn, 0, wx.ALL, 5) + mainSizer.Add(btnSizer, 0, wx.ALIGN_RIGHT) + + self.SetSizer(mainSizer) + if self.inputArea: + self.inputArea.SetFocus() + else: + self.outputArea.SetFocus() + if self.askBtn: + self.askBtn.Bind(wx.EVT_BUTTON, self.onAsk) + self.saveBtn.Bind(wx.EVT_BUTTON, self.onSave) + if self.inputArea: + self.inputArea.Bind(wx.EVT_TEXT_ENTER, self.onAsk) + if display_text and self.announce_on_open: + wx.CallLater(300, ui.message, display_text) 
+ + def onAsk(self, event): + if not self.inputArea: + return + question = self.inputArea.Value + if not question.strip(): return + # Translators: Format for displaying User message in a chat dialog + user_msg = _("\nYou: {text}\n").format(text=question) + self.outputArea.AppendText(user_msg) + self.inputArea.Clear() + # Translators: Message shown while processing in a chat dialog + msg = _("Thinking...") + ui.message(msg) + threading.Thread(target=self.process_question, args=(question,), daemon=True).start() + + def process_question(self, question): + result_tuple = self.callback_fn(self.context_data, question, self.chat_history, self.extra_info) + response_text, _ = result_tuple + if response_text: + if not (self.extra_info and self.extra_info.get('file_context')): + self.chat_history.append({"role": "user", "parts": [{"text": question}]}) + self.chat_history.append({"role": "model", "parts": [{"text": response_text}]}) + final_text = clean_markdown(response_text) if self.should_clean else response_text + wx.CallAfter(self.update_response, final_text, response_text) + + def update_response(self, display_text, raw_text=None): + if raw_text: + self.raw_content = raw_text + self.viewBtn.Enable(True) + self.saveContentBtn.Enable(True) + # Translators: Format for displaying AI message in a chat dialog + ai_msg = _("AI: {text}\n").format(text=display_text) + self.outputArea.AppendText(ai_msg) + self.saveBtn.Enable(True) + if config.conf["VisionAssistant"]["copy_to_clipboard"]: + api.copyToClip(raw_text if raw_text else display_text) + self.outputArea.ShowPosition(self.outputArea.GetLastPosition()) + ui.message(display_text) + + def report_save(self, msg): + if self.status_callback: self.status_callback(msg) + else: ui.message(msg) + + def onView(self, event): + full_html = "" + # Translators: Format for displaying User message in a chat dialog + user_label = _("\nYou: {text}\n").format(text="").strip() + # Translators: Format for displaying AI message in a chat dialog 
+ ai_label = _("AI: {text}\n").format(text="").strip() + + if self.chat_history: + for item in self.chat_history: + role = item.get("role", "") + text = item.get("parts", [{}])[0].get("text", "") + if role == "user": + safe_text = text.replace("&", "&").replace("<", "<").replace(">", ">") + full_html += f"

{user_label}

{safe_text}

" + elif role == "model": + formatted_text = markdown_to_html(text, full_page=False) + full_html += f"

{ai_label}

{formatted_text}
" + + if not full_html and self.raw_content: + formatted_text = markdown_to_html(self.raw_content, full_page=False) + full_html += f"

{ai_label}

{formatted_text}" + + if not full_html: return + try: + # Translators: Title of the formatted result window + ui.browseableMessage(full_html, _("Formatted Conversation"), isHtml=True) + except Exception as e: + # Translators: Error message if viewing fails + msg = _("Error displaying content: {error}").format(error=e) + show_error_dialog(msg) + + def onSave(self, event): + # Translators: Save dialog title + path = get_file_path(_("Save Chat Log"), "Text files (*.txt)|*.txt", mode="save") + if path: + try: + with open(path, "w", encoding="utf-8") as f: f.write(self.outputArea.GetValue()) + # Translators: Message shown on successful save of a file. + self.report_save(_("Saved.")) + except Exception as e: + # Translators: Message in the error dialog when saving fails. + msg = _("Save failed: {error}").format(error=e) + show_error_dialog(msg) + + def onSaveContent(self, event): + # Translators: Save dialog title + path = get_file_path(_("Save Result"), "HTML files (*.html)|*.html", mode="save") + if path: + try: + full_html = markdown_to_html(self.raw_content, full_page=True) + with open(path, "w", encoding="utf-8") as f: f.write(full_html) + # Translators: Message on successful save + self.report_save(_("Saved.")) + except Exception as e: + # Translators: Message in the error dialog when saving fails. 
+ msg = _("Save failed: {error}").format(error=e) + show_error_dialog(msg) + +class SettingsPanel(gui.settingsDialogs.SettingsPanel): + title = ADDON_NAME + def makeSettings(self, settingsSizer): + # --- Connection Group --- + # Translators: Title of the settings group for connection and updates + groupLabel = _("Connection") + self.connectionBox = wx.StaticBox(self, label=groupLabel) + connectionSizer = wx.StaticBoxSizer(self.connectionBox, wx.VERTICAL) + cHelper = gui.guiHelper.BoxSizerHelper(self.connectionBox, sizer=connectionSizer) + + # Translators: Label for API Key input + apiLabel = wx.StaticText(self.connectionBox, label=_("Gemini API Key (Separate multiple keys with comma or newline):")) + cHelper.addItem(apiLabel) + + api_value = config.conf["VisionAssistant"]["api_key"] + + self.apiKeyCtrl_hidden = wx.TextCtrl(self.connectionBox, value=api_value, style=wx.TE_PASSWORD, size=(-1, -1)) + + self.apiKeyCtrl_visible = wx.TextCtrl(self.connectionBox, value=api_value, style=wx.TE_MULTILINE | wx.TE_DONTWRAP, size=(-1, 60)) + self.apiKeyCtrl_visible.Hide() + + cHelper.addItem(self.apiKeyCtrl_hidden) + cHelper.addItem(self.apiKeyCtrl_visible) + + # Translators: Checkbox to toggle API Key visibility + self.showApiCheck = wx.CheckBox(self.connectionBox, label=_("Show API Key")) + self.showApiCheck.Bind(wx.EVT_CHECKBOX, self.onToggleApiVisibility) + cHelper.addItem(self.showApiCheck) + + model_display_names = [opt[0] for opt in MODELS] + # Translators: Label for Model selection + self.model = cHelper.addLabeledControl(_("AI Model:"), wx.Choice, choices=model_display_names) + current_id = config.conf["VisionAssistant"]["model_name"] + try: + index = next(i for i, v in enumerate(MODELS) if v[1] == current_id) + self.model.SetSelection(index) + except StopIteration: self.model.SetSelection(0) + + # Translators: Label for Proxy URL input + self.proxyUrl = cHelper.addLabeledControl(_("Proxy URL:"), wx.TextCtrl) + self.proxyUrl.Value = 
config.conf["VisionAssistant"]["proxy_url"] + + # Translators: Checkbox to enable/disable automatic update checks on NVDA startup + self.checkUpdateStartup = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Check for updates on startup"))) + self.checkUpdateStartup.Value = config.conf["VisionAssistant"]["check_update_startup"] + # Translators: Checkbox to toggle markdown cleaning in chat windows + self.cleanMarkdown = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Clean Markdown in Chat"))) + self.cleanMarkdown.Value = config.conf["VisionAssistant"]["clean_markdown_chat"] + # Translators: Checkbox to enable copying AI responses to clipboard + self.copyToClipboard = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Copy AI responses to clipboard"))) + self.copyToClipboard.Value = config.conf["VisionAssistant"]["copy_to_clipboard"] + # Translators: Checkbox to skip chat window and only speak AI responses + self.skipChatDialog = cHelper.addItem(wx.CheckBox(self.connectionBox, label=_("Direct Output (No Chat Window)"))) + self.skipChatDialog.Value = config.conf["VisionAssistant"]["skip_chat_dialog"] + settingsSizer.Add(connectionSizer, 0, wx.EXPAND | wx.ALL, 5) + + # --- Translation Languages Group --- + # Translators: Title of the settings group for translation languages configuration + groupLabel = _("Translation Languages") + langBox = wx.StaticBox(self, label=groupLabel) + langSizer = wx.StaticBoxSizer(langBox, wx.VERTICAL) + lHelper = gui.guiHelper.BoxSizerHelper(langBox, sizer=langSizer) + + # Translators: Label for Source Language selection + self.sourceLang = lHelper.addLabeledControl(_("Source:"), wx.Choice, choices=SOURCE_NAMES) + try: self.sourceLang.SetSelection(SOURCE_NAMES.index(config.conf["VisionAssistant"]["source_language"])) + except: self.sourceLang.SetSelection(0) + + # Translators: Label for Target Language selection + self.targetLang = lHelper.addLabeledControl(_("Target:"), wx.Choice, choices=TARGET_NAMES) + try: 
self.targetLang.SetSelection(TARGET_NAMES.index(config.conf["VisionAssistant"]["target_language"])) + except: self.targetLang.SetSelection(0) + + # Translators: Label for AI Response Language selection + self.aiResponseLang = lHelper.addLabeledControl(_("AI Response:"), wx.Choice, choices=TARGET_NAMES) + try: self.aiResponseLang.SetSelection(TARGET_NAMES.index(config.conf["VisionAssistant"]["ai_response_language"])) + except: self.aiResponseLang.SetSelection(0) + + # Translators: Checkbox for Smart Swap feature + self.smartSwap = lHelper.addItem(wx.CheckBox(langBox, label=_("Smart Swap"))) + self.smartSwap.Value = config.conf["VisionAssistant"]["smart_swap"] + settingsSizer.Add(langSizer, 0, wx.EXPAND | wx.ALL, 5) + + # --- Document Reader Settings --- + # Translators: Title of settings group for Document Reader features + groupLabel = _("Document Reader") + docBox = wx.StaticBox(self, label=groupLabel) + docSizer = wx.StaticBoxSizer(docBox, wx.VERTICAL) + dHelper = gui.guiHelper.BoxSizerHelper(docBox, sizer=docSizer) + + # Translators: Label for OCR Engine selection + self.ocr_sel = dHelper.addLabeledControl(_("OCR Engine:"), wx.Choice, choices=[x[0] for x in OCR_ENGINES]) + curr_ocr = config.conf["VisionAssistant"]["ocr_engine"] + try: + o_idx = next(i for i, v in enumerate(OCR_ENGINES) if v[1] == curr_ocr) + self.ocr_sel.SetSelection(o_idx) + except: self.ocr_sel.SetSelection(0) + + voice_choices = [f"{v[0]} - {v[1]}" for v in GEMINI_VOICES] + # Translators: Label for TTS Voice selection + self.voice_sel = dHelper.addLabeledControl(_("TTS Voice:"), wx.Choice, choices=voice_choices) + curr_voice = config.conf["VisionAssistant"]["tts_voice"] + try: + v_idx = next(i for i, v in enumerate(GEMINI_VOICES) if v[0] == curr_voice) + self.voice_sel.SetSelection(v_idx) + except: self.voice_sel.SetSelection(1) + settingsSizer.Add(docSizer, 0, wx.EXPAND | wx.ALL, 5) + + # --- CAPTCHA Group --- + # Translators: Title of the settings group for CAPTCHA options + groupLabel = 
_("CAPTCHA") + capBox = wx.StaticBox(self, label=groupLabel) + capSizer = wx.StaticBoxSizer(capBox, wx.VERTICAL) + capHelper = gui.guiHelper.BoxSizerHelper(capBox, sizer=capSizer) + # Translators: Label for CAPTCHA capture method selection + self.captchaMode = capHelper.addLabeledControl(_("Capture Method:"), wx.Choice, choices=[ + # Translators: A choice for capture method. Captures only the specific object under the NVDA navigator cursor. + _("Navigator Object"), + # Translators: A choice for capture method. Captures the entire visible screen area. + _("Full Screen") + ]) + self.captchaMode.SetSelection(0 if config.conf["VisionAssistant"]["captcha_mode"] == 'navigator' else 1) + settingsSizer.Add(capSizer, 0, wx.EXPAND | wx.ALL, 5) + + self.defaultPromptItems = get_configured_default_prompts() + self.customPromptItems = load_configured_custom_prompts() + + # --- Prompt Manager Group --- + # Translators: Title of the settings group for prompt management + groupLabel = _("Prompts") + promptsBox = wx.StaticBox(self, label=groupLabel) + promptsSizer = wx.StaticBoxSizer(promptsBox, wx.VERTICAL) + pHelper = gui.guiHelper.BoxSizerHelper(promptsBox, sizer=promptsSizer) + # Translators: Description for the prompt manager button. + pHelper.addItem(wx.StaticText(promptsBox, label=_("Manage default and custom prompts."))) + # Translators: Button label to open prompt manager dialog. + self.managePromptsBtn = wx.Button(promptsBox, label=_("Manage Prompts...")) + self.managePromptsBtn.Bind(wx.EVT_BUTTON, self.onManagePrompts) + pHelper.addItem(self.managePromptsBtn) + self.promptsSummary = wx.StaticText(promptsBox) + pHelper.addItem(self.promptsSummary) + self._refreshPromptSummary() + settingsSizer.Add(promptsSizer, 0, wx.EXPAND | wx.ALL, 5) + + def _refreshPromptSummary(self): + # Translators: Summary text for prompt counts in settings. 
+ summary = _("Default prompts: {defaultCount}, Custom prompts: {customCount}").format( + defaultCount=len(self.defaultPromptItems), + customCount=len(self.customPromptItems), + ) + self.promptsSummary.SetLabel(summary) + + def onManagePrompts(self, event): + top = wx.GetTopLevelParent(self) + dlg = PromptManagerDialog( + self, + self.defaultPromptItems, + self.customPromptItems, + PROMPT_VARIABLES_GUIDE, + ) + try: + if dlg.ShowModal() == wx.ID_OK: + self.defaultPromptItems = dlg.get_default_items() + self.customPromptItems = dlg.get_custom_items() + self._refreshPromptSummary() + finally: + dlg.Destroy() + if top: + top.Enable(True) + top.SetFocus() + + def onToggleApiVisibility(self, event): + if self.showApiCheck.IsChecked(): + self.apiKeyCtrl_visible.SetValue(self.apiKeyCtrl_hidden.GetValue()) + self.apiKeyCtrl_hidden.Hide() + self.apiKeyCtrl_visible.Show() + else: + self.apiKeyCtrl_hidden.SetValue(self.apiKeyCtrl_visible.GetValue()) + self.apiKeyCtrl_visible.Hide() + self.apiKeyCtrl_hidden.Show() + + self.connectionBox.GetParent().Layout() + + def onSave(self): + val = self.apiKeyCtrl_visible.GetValue() if self.showApiCheck.IsChecked() else self.apiKeyCtrl_hidden.GetValue() + config.conf["VisionAssistant"]["api_key"] = val.strip() + config.conf["VisionAssistant"]["model_name"] = MODELS[self.model.GetSelection()][1] + config.conf["VisionAssistant"]["proxy_url"] = self.proxyUrl.Value.strip() + config.conf["VisionAssistant"]["source_language"] = SOURCE_NAMES[self.sourceLang.GetSelection()] + config.conf["VisionAssistant"]["target_language"] = TARGET_NAMES[self.targetLang.GetSelection()] + config.conf["VisionAssistant"]["ai_response_language"] = TARGET_NAMES[self.aiResponseLang.GetSelection()] + config.conf["VisionAssistant"]["smart_swap"] = self.smartSwap.Value + config.conf["VisionAssistant"]["check_update_startup"] = self.checkUpdateStartup.Value + config.conf["VisionAssistant"]["clean_markdown_chat"] = self.cleanMarkdown.Value + 
		config.conf["VisionAssistant"]["copy_to_clipboard"] = self.copyToClipboard.Value
		config.conf["VisionAssistant"]["skip_chat_dialog"] = self.skipChatDialog.Value
		config.conf["VisionAssistant"]["captcha_mode"] = 'navigator' if self.captchaMode.GetSelection() == 0 else 'fullscreen'
		config.conf["VisionAssistant"]["custom_prompts_v2"] = serialize_custom_prompts_v2(self.customPromptItems)
		# Legacy v1 custom prompts are cleared once the v2 serialization is written.
		config.conf["VisionAssistant"]["custom_prompts"] = ""
		config.conf["VisionAssistant"]["default_refine_prompts"] = serialize_default_prompt_overrides(self.defaultPromptItems)
		config.conf["VisionAssistant"]["ocr_engine"] = OCR_ENGINES[self.ocr_sel.GetSelection()][1]
		config.conf["VisionAssistant"]["tts_voice"] = GEMINI_VOICES[self.voice_sel.GetSelection()][0]

class RangeDialog(wx.Dialog):
	"""Modal dialog selecting a page range and optional translation for processing."""
	def __init__(self, parent, total_pages):
		# Translators: Title of the PDF options dialog
		super().__init__(parent, title=_("Options"), size=(350, 320))
		sizer = wx.BoxSizer(wx.VERTICAL)
		# Translators: Label showing total pages found
		sizer.Add(wx.StaticText(self, label=_("Total Pages (All Files): {count}").format(count=total_pages)), 0, wx.ALL, 10)

		# Translators: Box title for page range selection
		box_range = wx.StaticBoxSizer(wx.VERTICAL, self, _("Range"))
		g_sizer = wx.FlexGridSizer(2, 2, 10, 10)
		# Translators: Label for start page
		g_sizer.Add(wx.StaticText(self, label=_("From:")), 0, wx.ALIGN_CENTER_VERTICAL)
		self.spin_from = wx.SpinCtrl(self, min=1, max=total_pages, initial=1)
		g_sizer.Add(self.spin_from, 1, wx.EXPAND)
		# Translators: Label for end page
		g_sizer.Add(wx.StaticText(self, label=_("To:")), 0, wx.ALIGN_CENTER_VERTICAL)
		self.spin_to = wx.SpinCtrl(self, min=1, max=total_pages, initial=total_pages)
		g_sizer.Add(self.spin_to, 1, wx.EXPAND)
		box_range.Add(g_sizer, 1, wx.EXPAND | wx.ALL, 5)
		sizer.Add(box_range, 0, wx.EXPAND | wx.ALL, 10)

		# Translators: Box title for translation options
		box_trans = wx.StaticBoxSizer(wx.VERTICAL, self, _("Translation"))
		# Translators: Checkbox to enable translation
		self.chk_trans = wx.CheckBox(self, label=_("Translate Output"))
		box_trans.Add(self.chk_trans, 0, wx.ALL, 5)
		h_sizer = wx.BoxSizer(wx.HORIZONTAL)
		# Translators: Label for target language
		h_sizer.Add(wx.StaticText(self, label=_("Target:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5)
		self.cmb_lang = wx.Choice(self, choices=TARGET_NAMES)
		self.cmb_lang.SetSelection(0)
		h_sizer.Add(self.cmb_lang, 1)
		box_trans.Add(h_sizer, 1, wx.EXPAND | wx.ALL, 5)
		sizer.Add(box_trans, 0, wx.EXPAND | wx.ALL, 10)

		btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
		# Translators: Button to start processing
		btn_ok = wx.Button(self, wx.ID_OK, label=_("Start"))
		btn_ok.SetDefault()
		# Translators: Button to cancel
		btn_cancel = wx.Button(self, wx.ID_CANCEL, label=_("Cancel"))
		btn_sizer.Add(btn_ok, 0, wx.RIGHT, 10)
		btn_sizer.Add(btn_cancel, 0)
		sizer.Add(btn_sizer, 0, wx.ALIGN_CENTER | wx.ALL, 10)
		self.SetSizer(sizer)

		# Language choice is only meaningful while translation is enabled.
		self.chk_trans.Bind(wx.EVT_CHECKBOX, self.on_check)
		self.cmb_lang.Disable()

	def on_check(self, event):
		# Enable the target-language picker only when translation is requested.
		self.cmb_lang.Enable(self.chk_trans.IsChecked())

	def get_settings(self):
		"""Return the user's choices; page indices are converted to 0-based."""
		return {
			'start': self.spin_from.GetValue() - 1,
			'end': self.spin_to.GetValue() - 1,
			'translate': self.chk_trans.IsChecked(),
			'lang': TARGET_NAMES[self.cmb_lang.GetSelection()]
		}

class ChatDialog(wx.Dialog):
	"""Dialog for chatting about an uploaded document file via Gemini."""
	# Singleton reference to the currently open dialog, if any.
	instance = None

	def __init__(self, parent, file_path):
		# Translators: Title of the chat dialog
		super().__init__(parent, title=_("Ask about Document"), size=(600, 500), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER)
		ChatDialog.instance = self
		self.file_path = file_path
		self.file_uri = None  # Set by init_upload once the upload succeeds.
		self.mime_type = get_mime_type(file_path)
		self.history = []

		sizer = wx.BoxSizer(wx.VERTICAL)
		# Translators: Label showing the analyzed file name
		lbl_info = wx.StaticText(self, label=_("File: {name}").format(name=os.path.basename(file_path)))
		sizer.Add(lbl_info, 0,
			wx.ALL, 5)
		self.display = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2)
		sizer.Add(self.display, 1, wx.EXPAND | wx.ALL, 10)
		# Translators: Status message while uploading
		self.display.SetValue(_("Uploading to Gemini...\n"))

		input_sizer = wx.BoxSizer(wx.HORIZONTAL)
		# Translators: Label for the chat input field
		input_sizer.Add(wx.StaticText(self, label=_("Your Question:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5)
		self.input = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, size=(-1, 30))
		self.input.Bind(wx.EVT_TEXT_ENTER, self.on_send)
		input_sizer.Add(self.input, 1, wx.EXPAND | wx.RIGHT, 5)

		# Translators: Button to send message
		self.btn_send = wx.Button(self, label=_("Send"))
		self.btn_send.Bind(wx.EVT_BUTTON, self.on_send)
		# Disabled until the upload completes (see on_ready).
		self.btn_send.Disable()
		input_sizer.Add(self.btn_send, 0)
		sizer.Add(input_sizer, 0, wx.EXPAND | wx.ALL, 10)
		self.SetSizer(sizer)
		self.Bind(wx.EVT_CLOSE, self.on_close)
		# Upload the file in the background so the dialog stays responsive.
		threading.Thread(target=self.init_upload, daemon=True).start()

	def on_close(self, event):
		# Clear the singleton so a new dialog can be opened later.
		ChatDialog.instance = None
		self.Destroy()

	def init_upload(self):
		"""Worker-thread body: upload the file and enable chat on success."""
		uri = GeminiHandler.upload_for_chat(self.file_path, self.mime_type)
		# An error is signalled in-band as a string starting with "ERROR:".
		if uri and not str(uri).startswith("ERROR:"):
			self.file_uri = uri
			wx.CallAfter(self.on_ready)
		else:
			# Strip the "ERROR:" prefix (6 characters) before showing it.
			err_msg = str(uri)[6:] if uri else _("Upload failed.")
			wx.CallAfter(show_error_dialog, err_msg)
			wx.CallAfter(self.Close)

	def on_ready(self):
		"""Runs on the UI thread once the upload has finished."""
		# Translators: Message when ready to chat
		self.display.AppendText(_("Ready! Ask your questions.\n"))
		self.btn_send.Enable()
		self.input.SetFocus()

	def on_send(self, event):
		"""Send the typed question to Gemini on a background thread."""
		msg = self.input.GetValue().strip()
		if not msg: return
		self.input.Clear()
		self.display.AppendText(f"You: {msg}\n")
		# Translators: Message showing AI is thinking
		ui.message(_("Thinking..."))
		threading.Thread(target=self.do_chat, args=(msg,), daemon=True).start()

	def do_chat(self, msg):
		"""Worker-thread body: run one chat turn and append the reply."""
		resp = GeminiHandler.chat(self.history, msg, self.file_uri, self.mime_type)

		# Errors are returned in-band with an "ERROR:" prefix.
		if str(resp).startswith("ERROR:"):
			show_error_dialog(resp[6:])
			if _vision_assistant_instance:
				# Translators: Initial status when the add-on is doing nothing
				_vision_assistant_instance.current_status = _("Idle")
			return

		self.history.append({"role": "user", "parts": [{"text": msg}]})
		self.history.append({"role": "model", "parts": [{"text": resp}]})
		wx.CallAfter(self.display.AppendText, f"AI: {resp}\n\n")
		# Translators: Spoken prefix for AI response
		wx.CallAfter(ui.message, _("AI: ") + resp)

class DocumentViewerDialog(wx.Dialog):
	# Paginated viewer for OCR/translation results of a document.
	def __init__(self, parent, virtual_doc, settings):
		# Translators: Title of the Document Reader window.
+ title_text = f"{ADDON_NAME} - {_('Document Reader')}" + super().__init__(parent, title=title_text, size=(800, 600), style=wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER | wx.MAXIMIZE_BOX) + self.v_doc = virtual_doc + self.start_page = settings['start'] + self.end_page = settings['end'] + self.do_translate = settings['translate'] + self.target_lang = settings['lang'] + self.range_count = self.end_page - self.start_page + 1 + self.page_cache = {} + self.current_page = self.start_page + self.thread_pool = ThreadPoolExecutor(max_workers=5) + + self.init_ui() + self.Centre() + threading.Thread(target=self.start_auto_processing, daemon=True).start() + + def init_ui(self): + panel = wx.Panel(self) + vbox = wx.BoxSizer(wx.VERTICAL) + # Translators: Initial status message + self.lbl_status = wx.StaticText(panel, label=_("Initializing...")) + vbox.Add(self.lbl_status, 0, wx.ALL, 5) + self.txt_content = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2) + vbox.Add(self.txt_content, 1, wx.EXPAND | wx.LEFT | wx.RIGHT, 10) + hbox_nav = wx.BoxSizer(wx.HORIZONTAL) + # Translators: Button to go to previous page + self.btn_prev = wx.Button(panel, label=_("Previous (Ctrl+PageUp)")) + self.btn_prev.Bind(wx.EVT_BUTTON, self.on_prev) + hbox_nav.Add(self.btn_prev, 0, wx.RIGHT, 5) + # Translators: Button to go to next page + self.btn_next = wx.Button(panel, label=_("Next (Ctrl+PageDown)")) + self.btn_next.Bind(wx.EVT_BUTTON, self.on_next) + hbox_nav.Add(self.btn_next, 0, wx.RIGHT, 15) + # Translators: Label for Go To Page + hbox_nav.Add(wx.StaticText(panel, label=_("Go to:")), 0, wx.ALIGN_CENTER_VERTICAL | wx.RIGHT, 5) + choices = [str(i+1) for i in range(self.start_page, self.end_page + 1)] + self.cmb_pages = wx.Choice(panel, choices=choices) + self.cmb_pages.Bind(wx.EVT_CHOICE, self.on_page_select) + hbox_nav.Add(self.cmb_pages, 0, wx.RIGHT, 15) + vbox.Add(hbox_nav, 0, wx.ALIGN_CENTER | wx.ALL, 10) + hbox_actions = wx.BoxSizer(wx.HORIZONTAL) + # Translators: Button 
to Ask questions about the document + self.btn_ask = wx.Button(panel, label=_("Ask AI (Alt+A)")) + self.btn_ask.Bind(wx.EVT_BUTTON, self.on_ask) + hbox_actions.Add(self.btn_ask, 0, wx.RIGHT, 5) + + # Translators: Button to force re-scan + self.btn_gemini = wx.Button(panel, label=_("Re-scan with Gemini (Alt+R)")) + self.btn_gemini.Bind(wx.EVT_BUTTON, self.on_gemini_scan) + hbox_actions.Add(self.btn_gemini, 0, wx.RIGHT, 5) + + # Translators: Button to generate audio + self.btn_tts = wx.Button(panel, label=_("Generate Audio (Alt+G)")) + self.btn_tts.Bind(wx.EVT_BUTTON, self.on_tts) + hbox_actions.Add(self.btn_tts, 0, wx.RIGHT, 5) + + # Translators: Button to view formatted content + self.btn_view = wx.Button(panel, label=_("View Formatted")) + self.btn_view.Bind(wx.EVT_BUTTON, self.on_view) + hbox_actions.Add(self.btn_view, 0, wx.RIGHT, 5) + + # Translators: Button to save text + self.btn_save = wx.Button(panel, label=_("Save (Alt+S)")) + self.btn_save.Bind(wx.EVT_BUTTON, self.on_save_all) + hbox_actions.Add(self.btn_save, 0) + + vbox.Add(hbox_actions, 0, wx.ALIGN_CENTER | wx.ALL, 5) + btn_close = wx.Button(panel, wx.ID_CLOSE, label=_("Close")) + btn_close.Bind(wx.EVT_BUTTON, lambda e: self.Destroy()) + vbox.Add(btn_close, 0, wx.ALIGN_RIGHT | wx.ALL, 10) + panel.SetSizer(vbox) + accel_tbl = wx.AcceleratorTable([ + (wx.ACCEL_CTRL, wx.WXK_PAGEDOWN, self.btn_next.GetId()), + (wx.ACCEL_CTRL, wx.WXK_PAGEUP, self.btn_prev.GetId()), + (wx.ACCEL_CTRL, ord('S'), self.btn_save.GetId()), + (wx.ACCEL_ALT, ord('S'), self.btn_save.GetId()), + (wx.ACCEL_ALT, ord('A'), self.btn_ask.GetId()), + (wx.ACCEL_ALT, ord('R'), self.btn_gemini.GetId()), + (wx.ACCEL_ALT, ord('G'), self.btn_tts.GetId()) + ]) + self.SetAcceleratorTable(accel_tbl) + self.cmb_pages.SetSelection(0) + self.update_view() + self.txt_content.SetFocus() + + def start_auto_processing(self): + engine = config.conf["VisionAssistant"]["ocr_engine"] + + if engine == 'gemini': + 
threading.Thread(target=self.gemini_scan_batch_thread, daemon=True).start() + else: + for i in range(self.start_page, self.end_page + 1): + self.thread_pool.submit(self.process_page_worker, i) + + def process_page_worker(self, page_num): + if page_num in self.page_cache: return + text = self._get_page_text_logic(page_num) + self.page_cache[page_num] = text + if page_num == self.current_page: + wx.CallAfter(self.update_view) + # Translators: Spoken message when the current page is ready + wx.CallAfter(ui.message, _("Page {num} ready").format(num=page_num + 1)) + + def _get_page_text_logic(self, page_num): + file_path, page_idx = self.v_doc.get_page_info(page_num) + if not file_path: return "" + try: + doc = fitz.open(file_path) + page = doc.load_page(page_idx) + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) + img_bytes = pix.tobytes("jpg") + doc.close() + engine = config.conf["VisionAssistant"]["ocr_engine"] + text = None + if engine == 'gemini': + try: text = GeminiHandler.ocr_page(img_bytes) + except: text = None + if not text or not text.strip() or engine == 'chrome': + text = ChromeOCREngine.recognize(img_bytes) + if not text or not text.strip(): + text = SmartProgrammersOCREngine.recognize(img_bytes) + if not text or not text.strip(): + # Translators: Placeholder text when OCR fails + text = _("[OCR failed. 
Try Gemini Re-scan.]") + if self.do_translate and text and "[OCR failed" not in text: + if engine == 'gemini': + text = GeminiHandler.translate(text, self.target_lang) + else: + text = GoogleTranslator.translate(text, self.target_lang) + return text + except: + # Translators: Error message for page processing failure + return _("Error processing page.") + + def update_view(self): + rel_page = self.current_page - self.start_page + 1 + # Translators: Status label format + self.lbl_status.SetLabel(_("Page {current} of {total}").format(current=rel_page, total=self.range_count)) + if self.current_page in self.page_cache: + self.txt_content.SetValue(self.page_cache[self.current_page]) + self.txt_content.SetInsertionPoint(0) + self.txt_content.SetFocus() + else: + # Translators: Status when page is loading + self.txt_content.SetValue(_("Processing in background...")) + self.txt_content.SetInsertionPoint(0) + self.txt_content.SetFocus() + self.btn_prev.Enable(self.current_page > self.start_page) + self.btn_next.Enable(self.current_page < self.end_page) + + def load_page(self, page_num): + if page_num < self.start_page or page_num > self.end_page: return + self.current_page = page_num + self.cmb_pages.SetSelection(page_num - self.start_page) + # Translators: Spoken message when switching pages + ui.message(_("Page {num}").format(num=page_num + 1)) + self.update_view() + + def on_prev(self, event): + if self.current_page > self.start_page: self.load_page(self.current_page - 1) + + def on_next(self, event): + if self.current_page < self.end_page: self.load_page(self.current_page + 1) + + def on_page_select(self, event): + self.load_page(self.start_page + self.cmb_pages.GetSelection()) + + def on_view(self, event): + full_html = [] + for i in range(self.start_page, self.end_page + 1): + if i in self.page_cache: + page_text = self.page_cache[i] + page_content = markdown_to_html(page_text, full_page=False) + # Translators: Heading for each page in the formatted content view. 
+ page_label = _("Page {num}").format(num=i+1) + full_html.append(f"

{page_label}

") + full_html.append(page_content) + full_html.append("
") + + if not full_html: + text = self.txt_content.GetValue() + if not text: return + full_html.append(markdown_to_html(text, full_page=False)) + + combined_html = "".join(full_html) + try: + # Translators: Title of the formatted result window + ui.browseableMessage(combined_html, _("Formatted Content"), isHtml=True) + except Exception as e: + show_error_dialog(str(e)) + + def on_gemini_scan(self, event): + if not config.conf["VisionAssistant"]["api_key"]: + wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), wx.ICON_ERROR) + return + menu = wx.Menu() + # Translators: Menu option for current page + item_curr = menu.Append(wx.ID_ANY, _("Current Page")) + # Translators: Menu option for all pages + item_all = menu.Append(wx.ID_ANY, _("All Pages (In Range)")) + self.Bind(wx.EVT_MENU, self.do_rescan_current, item_curr) + self.Bind(wx.EVT_MENU, self.do_rescan_all, item_all) + self.PopupMenu(menu) + menu.Destroy() + + def do_rescan_current(self, event): + if self.current_page in self.page_cache: del self.page_cache[self.current_page] + self.update_view() + # Translators: Message during manual scan + ui.message(_("Scanning with Gemini...")) + threading.Thread(target=self.gemini_scan_single_thread, args=(self.current_page,), daemon=True).start() + + def gemini_scan_single_thread(self, page_num): + try: + file_path, page_idx = self.v_doc.get_page_info(page_num) + doc = fitz.open(file_path) + page = doc.load_page(page_idx) + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) + text = GeminiHandler.ocr_page(pix.tobytes("jpg")) + doc.close() + if self.do_translate: text = GeminiHandler.translate(text, self.target_lang) + self.page_cache[page_num] = text + if self.current_page == page_num: + wx.CallAfter(self.update_view) + # Translators: Message when scan is complete + wx.CallAfter(ui.message, _("Scan complete")) + except: pass + + def do_rescan_all(self, event): + threading.Thread(target=self.gemini_scan_batch_thread, daemon=True).start() + + def 
gemini_scan_batch_thread(self): + # Translators: Message when batch scan starts + msg = _("Batch Processing Started") + if _vision_assistant_instance: _vision_assistant_instance.current_status = msg + wx.CallAfter(ui.message, msg) + + for i in range(self.start_page, self.end_page + 1): + if i in self.page_cache: del self.page_cache[i] + wx.CallAfter(self.update_view) + + upload_path = self.v_doc.create_merged_pdf(self.start_page, self.end_page) + if not upload_path: + # Translators: Error message if PDF creation fails + wx.CallAfter(self.lbl_status.SetLabel, _("Error creating temporary PDF.")) + return + + try: + count = (self.end_page - self.start_page) + 1 + results = GeminiHandler.upload_and_process_batch(upload_path, "application/pdf", count) + + if not results or (len(results) == 1 and str(results[0]).startswith("ERROR:")): + err_msg = results[0][6:] if results else _("Unknown error") + # Translators: Message reported when batch scan fails + error_text = _("Scan failed: {err}").format(err=err_msg) + for i in range(self.start_page, self.end_page + 1): + self.page_cache[i] = error_text + + wx.CallAfter(self.update_view) + wx.CallAfter(ui.message, error_text) + return + + for i, text_part in enumerate(results): + if i >= count: break + idx = self.start_page + i + clean = text_part.strip() + if self.do_translate: + clean = GeminiHandler.translate(clean, self.target_lang) + self.page_cache[idx] = clean + + wx.CallAfter(self.update_view) + # Translators: Message when batch scan is complete + final_msg = _("Batch Scan Complete") + if _vision_assistant_instance: + # Translators: Initial status when the add-on is doing nothing + _vision_assistant_instance.current_status = _("Idle") + wx.CallAfter(ui.message, final_msg) + finally: + if upload_path and os.path.exists(upload_path): + try: os.remove(upload_path) + except: pass + + def on_tts(self, event): + if not config.conf["VisionAssistant"]["api_key"]: + wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), 
wx.ICON_ERROR) + return + menu = wx.Menu() + # Translators: Menu option for TTS current page + item_curr = menu.Append(wx.ID_ANY, _("Generate for Current Page")) + # Translators: Menu option for TTS all pages + item_all = menu.Append(wx.ID_ANY, _("Generate for All Pages (In Range)")) + self.Bind(wx.EVT_MENU, self.do_tts_current, item_curr) + self.Bind(wx.EVT_MENU, self.do_tts_all, item_all) + self.PopupMenu(menu) + menu.Destroy() + + def do_tts_current(self, event): + text = self.txt_content.GetValue().strip() + if not text: + # Translators: Error message when text field is empty + wx.MessageBox(_("No text to read."), "Error") + return + self._save_tts(text) + + def do_tts_all(self, event): + threading.Thread(target=self.tts_batch_thread, daemon=True).start() + + def tts_batch_thread(self): + full_text = [] + # Translators: Message while gathering text + wx.CallAfter(ui.message, _("Gathering text for audio...")) + for i in range(self.start_page, self.end_page + 1): + while i not in self.page_cache: time.sleep(0.1) + full_text.append(self.page_cache[i]) + final_text = "\n".join(full_text).strip() + if not final_text: return + wx.CallAfter(self._save_tts, final_text) + + def _save_tts(self, text): + # Translators: File dialog title for saving audio + path = get_file_path(_("Save Audio"), "MP3 Files (*.mp3)|*.mp3|WAV Files (*.wav)|*.wav", mode="save") + if path: + voice = config.conf["VisionAssistant"]["tts_voice"] + threading.Thread(target=self.tts_worker, args=(text, voice, path), daemon=True).start() + + def tts_worker(self, text, voice, path): + # Translators: Message while generating audio + msg = _("Generating Audio...") + if _vision_assistant_instance: _vision_assistant_instance.current_status = msg + wx.CallAfter(ui.message, msg) + try: + audio_b64 = GeminiHandler.generate_speech(text, voice) + if not audio_b64 or len(audio_b64) < 100: + wx.CallAfter(wx.MessageBox, f"TTS Error: {audio_b64}", "Error", wx.ICON_ERROR) + return + missing_padding = len(audio_b64) % 
4 + if missing_padding: audio_b64 += '=' * (4 - missing_padding) + pcm_data = base64.b64decode(audio_b64) + + if path.lower().endswith(".mp3"): + import subprocess + lame_path = os.path.join(os.path.dirname(__file__), "lib", "lame.exe") + if not os.path.exists(lame_path): + wx.CallAfter(wx.MessageBox, _("lame.exe not found in lib folder."), "Error", wx.ICON_ERROR) + return + + process = subprocess.Popen( + [lame_path, "-r", "-s", "24", "-m", "m", "-b", "128", "--bitwidth", "16", "--resample", "24", "-q", "0", "-", path], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + creationflags=getattr(subprocess, 'CREATE_NO_WINDOW', 0) + ) + + process.communicate(input=pcm_data) + else: + with wave.open(path, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(24000) + wf.writeframes(pcm_data) + + # Translators: Spoken message when audio is saved + res_msg = _("Audio Saved") + if _vision_assistant_instance: _vision_assistant_instance.current_status = _("Idle") + wx.CallAfter(ui.message, res_msg) + wx.CallAfter(wx.MessageBox, _("Audio file generated and saved successfully."), _("Success"), wx.OK | wx.ICON_INFORMATION) + except Exception as e: + if _vision_assistant_instance: _vision_assistant_instance.current_status = _("Idle") + wx.CallAfter(wx.MessageBox, f"TTS Error: {e}", "Error", wx.ICON_ERROR) + + def on_ask(self, event): + if not config.conf["VisionAssistant"]["api_key"]: + wx.MessageBox(_("Please configure Gemini API Key."), _("Error"), wx.ICON_ERROR) + return + if ChatDialog.instance: + ChatDialog.instance.Raise() + ChatDialog.instance.SetFocus() + return + file_path, _ = self.v_doc.get_page_info(self.current_page) + if file_path: + dlg = ChatDialog(self, file_path) + dlg.Show() + + def on_save_all(self, event): + # Translators: File dialog filter for saving text/html + wildcard = "Text File (*.txt)|*.txt|HTML File (*.html)|*.html" + # Translators: File dialog title for saving + path = get_file_path(_("Save"), wildcard, 
mode="save") + if path: + is_html = path.lower().endswith('.html') + self.btn_save.Disable() + threading.Thread(target=self.save_thread, args=(path, is_html), daemon=True).start() + + def save_thread(self, path, is_html): + full_content = [] + try: + for i in range(self.start_page, self.end_page + 1): + # Translators: Message showing save progress + wx.CallAfter(self.lbl_status.SetLabel, _("Saving Page {num}...").format(num=i+1)) + while i not in self.page_cache: time.sleep(0.1) + txt = self.page_cache[i] + if is_html: + h = markdown_to_html(txt) + if "" in h: h = h.split("")[1].split("")[0] + full_content.append(f"

Page {i+1}

{h}") + else: + full_content.append(f"--- Page {i+1} ---\n{txt}\n") + with open(path, "w", encoding="utf-8") as f: + if is_html: f.write(f"{''.join(full_content)}") + else: f.write("\n".join(full_content)) + # Translators: Status label when save is complete + wx.CallAfter(self.lbl_status.SetLabel, _("Saved")) + # Translators: Message box content for successful save + wx.CallAfter(wx.MessageBox, _("File saved successfully."), _("Success"), wx.OK | wx.ICON_INFORMATION) + except Exception as e: + wx.CallAfter(wx.MessageBox, f"Save Error: {e}", "Error", wx.ICON_ERROR) + finally: wx.CallAfter(self.btn_save.Enable) diff --git a/addon/globalPlugins/visionAssistant/markdown_utils.py b/addon/globalPlugins/visionAssistant/markdown_utils.py new file mode 100644 index 0000000..ba438a6 --- /dev/null +++ b/addon/globalPlugins/visionAssistant/markdown_utils.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +import re + + +def clean_markdown(text): + if not text: + return "" + text = re.sub(r'\*\*|__|[*_]', '', text) + text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE) + text = re.sub(r'```', '', text) + text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) + text = re.sub(r'^\s*-\s+', '', text, flags=re.MULTILINE) + return text.strip() + + +def markdown_to_html(text, full_page=False): + if not text: + return "" + + html = text.replace("&", "&").replace("<", "<").replace(">", ">") + html = re.sub(r'\*\*(.*?)\*\*', r'\1', html) + html = re.sub(r'__(.*?)__', r'\1', html) + html = re.sub(r'^### (.*)', r'

\1

', html, flags=re.M) + html = re.sub(r'^## (.*)', r'

\1

', html, flags=re.M) + html = re.sub(r'^# (.*)', r'

\1

', html, flags=re.M) + + lines = html.split('\n') + in_table = False + new_lines = [] + table_style = 'border="1" style="border-collapse: collapse; width: 100%; margin-bottom: 10px;"' + td_style = 'style="padding: 5px; border: 1px solid #ccc;"' + + for line in lines: + stripped = line.strip() + if stripped.startswith('|') or (stripped.count('|') > 1 and len(stripped) > 5): + if not in_table: + new_lines.append(f'') + in_table = True + if '---' in stripped: + continue + row_content = stripped.strip('|').split('|') + cells = "".join([f'' for c in row_content]) + new_lines.append(f'{cells}') + else: + if in_table: + new_lines.append('
{c.strip()}
') + in_table = False + if stripped: + new_lines.append(line + "
") + else: + new_lines.append("
") + if in_table: + new_lines.append('') + html_body = "".join(new_lines) + + if not full_page: + return html_body + return f"""{html_body}""" diff --git a/addon/globalPlugins/visionAssistant/prompt_helpers.py b/addon/globalPlugins/visionAssistant/prompt_helpers.py new file mode 100644 index 0000000..64d8c01 --- /dev/null +++ b/addon/globalPlugins/visionAssistant/prompt_helpers.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- + +import json +import logging + +import addonHandler +import config + +from .constants import DEFAULT_SYSTEM_PROMPTS, LEGACY_REFINER_TOKENS, REFINE_PROMPT_KEYS + +addonHandler.initTranslation() +log = logging.getLogger(__name__) + + +def get_builtin_default_prompts(): + builtins = [] + for item in DEFAULT_SYSTEM_PROMPTS: + p = str(item["prompt"]).strip() + builtins.append({ + "key": item["key"], + "section": item["section"], + "label": item["label"], + "display_label": f"{item['section']} - {item['label']}", + "internal": bool(item.get("internal")), + "prompt": p, + "default": p, + }) + return builtins + + +def get_builtin_default_prompt_map(): + return {item["key"]: item for item in get_builtin_default_prompts()} + + +def _normalize_custom_prompt_items(items): + normalized = [] + if not isinstance(items, list): + return normalized + + for item in items: + if not isinstance(item, dict): + continue + name = item.get("name") + content = item.get("content") + if not isinstance(name, str) or not isinstance(content, str): + continue + name = name.strip() + content = content.strip() + if name and content: + normalized.append({"name": name, "content": content}) + return normalized + + +def parse_custom_prompts_legacy(raw_value): + items = [] + if not raw_value: + return items + + normalized = raw_value.replace("\r\n", "\n").replace("\r", "\n") + for line in normalized.split("\n"): + for segment in line.split("|"): + segment = segment.strip() + if not segment or ":" not in segment: + continue + name, content = segment.split(":", 1) + name = name.strip() 
def parse_custom_prompts_v2(raw_value):
    """Decode the JSON-based (v2) custom-prompt store.

    Returns a normalized list of {"name", "content"} dicts on success, or
    None when the value is empty/absent or not valid JSON — callers then
    fall back to the legacy pipe-separated format.
    """
    if not (isinstance(raw_value, str) and raw_value.strip()):
        return None
    try:
        decoded = json.loads(raw_value)
    except Exception as exc:
        log.warning(f"Invalid custom_prompts_v2 config, falling back to legacy format: {exc}")
        return None
    return _normalize_custom_prompt_items(decoded)


def serialize_custom_prompts_v2(items):
    """Encode custom prompts as the canonical v2 JSON string ("" when empty)."""
    cleaned = _normalize_custom_prompt_items(items)
    return json.dumps(cleaned, ensure_ascii=False) if cleaned else ""


def load_configured_custom_prompts():
    """Read custom prompts from config, preferring the v2 store over legacy."""
    try:
        raw_v2 = config.conf["VisionAssistant"]["custom_prompts_v2"]
    except Exception:
        raw_v2 = ""
    parsed = parse_custom_prompts_v2(raw_v2)
    if parsed is not None:
        return parsed
    # v2 missing or invalid: fall back to the legacy "name:content|..." format.
    return parse_custom_prompts_legacy(config.conf["VisionAssistant"]["custom_prompts"])
+ legacy_vision = mutable.pop("vision_image_analysis", None) + if legacy_vision is not None: + changed = True + if isinstance(legacy_vision, str) and legacy_vision.strip(): + legacy_text = legacy_vision.strip() + nav_value = mutable.get("vision_navigator_object") + if not isinstance(nav_value, str) or not nav_value.strip(): + mutable["vision_navigator_object"] = legacy_text + changed = True + full_value = mutable.get("vision_fullscreen") + if not isinstance(full_value, str) or not full_value.strip(): + mutable["vision_fullscreen"] = legacy_text + changed = True + + valid_keys = set(get_builtin_default_prompt_map().keys()) + sanitized = {} + for key, value in mutable.items(): + if key not in valid_keys or not isinstance(value, str): + changed = True + continue + prompt_text = value.strip() + if not prompt_text: + changed = True + continue + if key in LEGACY_REFINER_TOKENS and prompt_text == LEGACY_REFINER_TOKENS[key]: + # Drop old token-only overrides and fallback to current built-ins. + changed = True + continue + if prompt_text != value: + changed = True + sanitized[key] = prompt_text + return sanitized, changed + + +def migrate_prompt_config_if_needed(): + changed = False + + try: + raw_v2 = config.conf["VisionAssistant"]["custom_prompts_v2"] + except Exception: + raw_v2 = "" + raw_legacy = config.conf["VisionAssistant"]["custom_prompts"] + + v2_items = parse_custom_prompts_v2(raw_v2) + if v2_items is None: + target_items = parse_custom_prompts_legacy(raw_legacy) + else: + target_items = v2_items + + serialized_v2 = serialize_custom_prompts_v2(target_items) + if serialized_v2 != (raw_v2 or ""): + config.conf["VisionAssistant"]["custom_prompts_v2"] = serialized_v2 + changed = True + + # Legacy mirror is disabled. Clear old storage to prevent stale fallback data. 
def load_default_prompt_overrides():
    """Load user overrides for built-in refine prompts from config.

    Returns a dict mapping prompt key -> override text; an empty dict when
    the setting is absent, empty, or not valid JSON.
    """
    try:
        raw = config.conf["VisionAssistant"]["default_refine_prompts"]
    except Exception:
        raw = ""
    if not isinstance(raw, str) or not raw.strip():
        return {}

    try:
        data = json.loads(raw)
    except Exception as e:
        log.warning(f"Invalid default_refine_prompts config, using built-ins: {e}")
        return {}

    # Bug fix: unpack into a named variable instead of `_`, which shadowed
    # the gettext translation alias inside this function.
    overrides, _changed = _sanitize_default_prompt_overrides(data)
    return overrides


def get_configured_default_prompt_map():
    """Return the built-in prompt map with valid user overrides applied."""
    prompt_map = get_builtin_default_prompt_map()
    for key, override in load_default_prompt_overrides().items():
        if key not in prompt_map:
            continue
        # Ignore stale token-only overrides left behind by older releases.
        if key in LEGACY_REFINER_TOKENS and override == LEGACY_REFINER_TOKENS[key]:
            continue
        prompt_map[key]["prompt"] = override
    return prompt_map


def get_configured_default_prompts():
    """Return the non-internal configured prompts, sorted by display label."""
    prompt_map = get_configured_default_prompt_map()
    items = []
    for item in DEFAULT_SYSTEM_PROMPTS:
        if item.get("internal"):
            continue
        key = item["key"]
        if key in prompt_map:
            items.append(dict(prompt_map[key]))
    # Bug fix: the sort key no longer shadows the loop variable `item`.
    items.sort(key=lambda entry: entry.get("display_label", "").casefold())
    return items


def get_prompt_text(prompt_key):
    """Return the configured prompt text for *prompt_key*, or "" if unknown."""
    item = get_configured_default_prompt_map().get(prompt_key)
    return item["prompt"] if item else ""
def serialize_default_prompt_overrides(items):
    """Serialize prompt items into the JSON override map stored in config.

    Only prompts whose text differs from the built-in default are kept;
    returns "" when there is nothing to override.
    """
    if not items:
        return ""

    defaults = {entry["key"]: entry["prompt"] for entry in get_builtin_default_prompts()}
    overrides = {}
    for entry in items:
        key = entry.get("key")
        text = entry.get("prompt", "")
        if key not in defaults or not isinstance(text, str):
            continue
        text = text.strip()
        if text and text != defaults[key]:
            overrides[key] = text

    return json.dumps(overrides, ensure_ascii=False) if overrides else ""


def get_refine_menu_options():
    """Build (label, prompt) pairs for the Refine menu: built-ins, then customs."""
    prompt_map = get_configured_default_prompt_map()
    options = [
        (prompt_map[key]["label"], prompt_map[key]["prompt"])
        for key in REFINE_PROMPT_KEYS
        if key in prompt_map
    ]
    for custom in load_configured_custom_prompts():
        # Translators: Prefix for custom prompts in the Refine menu
        options.append((_("Custom: ") + custom["name"], custom["content"]))
    return options


def apply_prompt_template(template, replacements):
    """Substitute {key} placeholders in *template* and return the stripped result.

    *replacements* is an iterable of (key, value) pairs; values are coerced
    with str(). A non-string template yields "".
    """
    if not isinstance(template, str):
        return ""
    result = template
    for key, value in replacements:
        result = result.replace(f"{{{key}}}", str(value))
    return result.strip()
def get_mime_type(path):
    """Map a file extension to the MIME type used for Gemini uploads."""
    # NOTE(review): .tif/.tiff deliberately report image/jpeg here
    # (original behavior preserved) — confirm against the upload pipeline.
    mime_by_ext = {
        '.pdf': 'application/pdf',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.tif': 'image/jpeg',
        '.tiff': 'image/jpeg',
        '.mp3': 'audio/mpeg',
        '.wav': 'audio/wav',
        '.ogg': 'audio/ogg',
        '.mp4': 'video/mp4',
    }
    ext = os.path.splitext(path)[1].lower()
    return mime_by_ext.get(ext, 'application/octet-stream')


def show_error_dialog(message):
    """Show *message* in a modal error box, marshalled to the GUI thread."""
    # Translators: Title of the error dialog box
    title = _("{name} Error").format(name=ADDON_NAME)
    wx.CallAfter(gui.messageBox, message, title, wx.OK | wx.ICON_ERROR)


def send_ctrl_v():
    """Synthesize a Ctrl+V keystroke via user32.keybd_event; True on success."""
    VK_CONTROL = 0x11
    VK_V = 0x56
    KEYEVENTF_KEYUP = 0x0002
    try:
        user32 = ctypes.windll.user32
        user32.keybd_event(VK_CONTROL, 0, 0, 0)
        user32.keybd_event(VK_V, 0, 0, 0)
        user32.keybd_event(VK_V, 0, KEYEVENTF_KEYUP, 0)
        user32.keybd_event(VK_CONTROL, 0, KEYEVENTF_KEYUP, 0)
    except Exception:
        log.warning("Failed to send Ctrl+V", exc_info=True)
        return False
    return True


def get_proxy_opener():
    """Return a urllib opener honoring the configured proxy, if any.

    The proxy is only applied when the URL looks like a local address or a
    host:port style address (heuristic kept from the original code).
    """
    proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip()
    if proxy_url and (
        "127.0.0.1" in proxy_url
        or "localhost" in proxy_url
        or ":" in proxy_url.split("/")[-1]
    ):
        handler = request.ProxyHandler({'http': proxy_url, 'https': proxy_url})
        return request.build_opener(handler)
    return request.build_opener()
def get_instagram_download_link(insta_url):
    """Resolve an Instagram post/story URL to a direct media URL via anon-viewer.com.

    Story URLs are queried by username with the ``allstories`` method; other
    URLs are passed through verbatim. Returns the media URL, or None on failure.
    """
    jar = cookiejar.CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(jar))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://anon-viewer.com/',
        'Accept': '*/*',
    }
    opener.addheaders = list(headers.items())
    try:
        # Prime the session cookies first.
        opener.open("https://anon-viewer.com/", timeout=30)

        if "/stories/" in insta_url:
            parts = insta_url.split("/")
            username = parts[parts.index("stories") + 1]
            api_url = f"https://anon-viewer.com/content.php?url={username}&method=allstories"
        else:
            encoded_url = quote(insta_url, safe='')
            api_url = f"https://anon-viewer.com/content.php?url={encoded_url}"

        response = opener.open(api_url, timeout=60)
        if response.getcode() == 200:
            data = json.loads(response.read().decode('utf-8'))
            html_text = data.get('html', '')
            match = re.search(r'href="([^"]+anon-viewer\.com/media\.php\?media=[^"]+)"', html_text)
            if match:
                # FIX: the API returns HTML-escaped links; the original read
                # ``.replace('&', '&')`` (a no-op — almost certainly a
                # transmission-corrupted ``.replace('&amp;', '&')``).
                return match.group(1).replace('&amp;', '&')
            # NOTE(review): the original continued with a fallback that
            # scanned the HTML for a <source src="..."> tag; that region of
            # the file was corrupted in transit and could not be recovered.
            # TODO: restore the fallback from version control.
    except Exception:
        log.warning("Failed to fetch Instagram download link", exc_info=True)
    return None


# NOTE(review): a Google-Translate fallback helper originally followed here;
# only its tail survived the corrupted transfer (joins ``x[0]`` items of
# ``data[0]`` and logs "Google Translate Failed" on error). It is not
# reproduced — restore it from version control.


class GeminiHandler:
    """Static helper around the Gemini REST API.

    All calls rotate through the comma/newline-separated API keys from the
    add-on configuration. Files uploaded to the File API are pinned to the
    key that uploaded them, because a file URI is only visible to its owner.
    """

    _working_key_idx = 0   # index of the key that last succeeded
    _file_uri_keys = {}    # uploaded file URI -> API key that owns it
    _max_retries = 5       # attempts per key for transient failures

    @staticmethod
    def _get_api_keys():
        """Return configured API keys (comma- or newline-separated, trimmed)."""
        raw = config.conf["VisionAssistant"]["api_key"]
        normalized = raw.replace('\r\n', ',').replace('\n', ',')
        return [key.strip() for key in normalized.split(',') if key.strip()]

    @staticmethod
    def _get_opener():
        """Return the module-wide (optionally proxied) urllib opener."""
        return get_proxy_opener()

    @staticmethod
    def _handle_error(e):
        """Map an HTTPError to a user message or a retry sentinel string."""
        if hasattr(e, 'code'):
            if e.code == 400:
                # Translators: Error message for Bad Request (400)
                return _("Error 400: Bad Request (Check API Key)")
            if e.code == 403:
                # Translators: Error message for Forbidden (403)
                return _("Error 403: Forbidden (Check Region)")
            if e.code == 429:
                return "QUOTA_EXCEEDED"
            if e.code >= 500:
                return "SERVER_ERROR"
        return str(e)

    @staticmethod
    def _call_with_retry(func_logic, key, *args):
        """Run *func_logic* up to ``_max_retries`` times with one key.

        Quota (429) and 5xx responses plus plain URL errors are retried with
        a linear backoff; any other HTTPError propagates immediately.
        """
        last_exc = None
        for attempt in range(GeminiHandler._max_retries):
            try:
                return func_logic(key, *args)
            except error.HTTPError as e:
                if GeminiHandler._handle_error(e) not in ("QUOTA_EXCEEDED", "SERVER_ERROR"):
                    raise
                last_exc = e
            except error.URLError as e:
                last_exc = e
            if attempt < GeminiHandler._max_retries - 1:
                time.sleep(0.5 * (attempt + 1))
        raise last_exc

    @staticmethod
    def _register_file_uri(uri, key):
        """Remember which key uploaded *uri*; evict oldest entries past 200."""
        if uri and key:
            GeminiHandler._file_uri_keys[uri] = key
            while len(GeminiHandler._file_uri_keys) > 200:
                GeminiHandler._file_uri_keys.pop(next(iter(GeminiHandler._file_uri_keys)))

    @staticmethod
    def _get_registered_key(uri):
        """Return the key that uploaded *uri*, or None when unknown."""
        return GeminiHandler._file_uri_keys.get(uri) if uri else None

    @staticmethod
    def _call_with_key(func_logic, key, *args):
        """Run *func_logic* with one fixed key; map failures to 'ERROR:...'."""
        try:
            return GeminiHandler._call_with_retry(func_logic, key, *args)
        except error.HTTPError as e:
            err_msg = GeminiHandler._handle_error(e)
            if err_msg == "QUOTA_EXCEEDED":
                # Translators: Message of a dialog which may pop up while performing an AI call
                err_msg = _("Error 429: Quota Exceeded (Try later)")
            elif err_msg == "SERVER_ERROR":
                # Translators: Message of a dialog which may pop up while performing an AI call
                err_msg = _("Server Error {code}: {reason}").format(code=e.code, reason=e.reason)
            return "ERROR:" + err_msg
        except Exception as e:
            return "ERROR:" + str(e)

    @staticmethod
    def _call_with_rotation(func_logic, *args):
        """Try *func_logic* with each configured key, starting from the last
        one that worked; return its result or an 'ERROR:...' string."""
        keys = GeminiHandler._get_api_keys()
        if not keys:
            # Translators: Error when no API keys are found in settings
            return "ERROR:" + _("No API Keys configured.")

        num_keys = len(keys)
        for i in range(num_keys):
            idx = (GeminiHandler._working_key_idx + i) % num_keys
            try:
                result = GeminiHandler._call_with_retry(func_logic, keys[idx], *args)
                GeminiHandler._working_key_idx = idx  # remember the good key
                return result
            except error.HTTPError as e:
                err_msg = GeminiHandler._handle_error(e)
                if err_msg in ("QUOTA_EXCEEDED", "SERVER_ERROR"):
                    if i < num_keys - 1:
                        continue
                    # Translators: Error when all available API keys fail
                    return "ERROR:" + _("All API Keys failed (Quota/Server).")
                return "ERROR:" + err_msg
            except Exception as e:
                return "ERROR:" + str(e)
        return "ERROR:" + _("Unknown error occurred.")

    @staticmethod
    def translate(text, target_lang):
        """Translate *text* to *target_lang* with the configured Gemini model."""
        def _logic(key, txt, lang):
            model = config.conf["VisionAssistant"]["model_name"]
            url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
            quick_template = get_prompt_text("translate_quick") or "Translate to {target_lang}. Output ONLY translation."
            quick_prompt = apply_prompt_template(quick_template, [("target_lang", lang)])
            payload = {"contents": [{"parts": [{"text": quick_prompt}, {"text": txt}]}]}
            req = request.Request(
                url,
                data=json.dumps(payload).encode('utf-8'),
                headers={"Content-Type": "application/json", "x-goog-api-key": key},
            )
            with GeminiHandler._get_opener().open(req, timeout=90) as r:
                return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text']
        return GeminiHandler._call_with_rotation(_logic, text, target_lang)

    @staticmethod
    def ocr_page(image_bytes):
        """OCR one JPEG page (raw bytes) and return the extracted text."""
        def _logic(key, img_data):
            model = config.conf["VisionAssistant"]["model_name"]
            url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
            ocr_image_prompt = get_prompt_text("ocr_image_extract")
            payload = {"contents": [{"parts": [
                {"inline_data": {"mime_type": "image/jpeg", "data": base64.b64encode(img_data).decode('utf-8')}},
                {"text": ocr_image_prompt},
            ]}]}
            req = request.Request(
                url,
                data=json.dumps(payload).encode('utf-8'),
                headers={"Content-Type": "application/json", "x-goog-api-key": key},
            )
            with GeminiHandler._get_opener().open(req, timeout=120) as r:
                return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text']
        return GeminiHandler._call_with_rotation(_logic, image_bytes)

    @staticmethod
    def upload_and_process_batch(file_path, mime_type, page_count):
        """Upload *file_path* via the File API and OCR it in a single request.

        Returns a list of page texts (split on the ``[[[PAGE_SEP]]]`` marker)
        or a single-element ``["ERROR:..."]`` list. *page_count* is currently
        unused but kept for interface compatibility with callers.
        """
        keys = GeminiHandler._get_api_keys()
        if not keys:
            # Translators: Error message for missing API Keys
            return ["ERROR:" + _("No API Keys.")]
        model = config.conf["VisionAssistant"]["model_name"]

        opener = GeminiHandler._get_opener()
        proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip()
        # The proxy setting doubles as a mirror base URL when configured.
        base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com"

        for i, key in enumerate(keys):
            try:
                f_size = os.path.getsize(file_path)
                # Step 1: start a resumable upload session.
                init_url = f"{base_url}/upload/v1beta/files"
                headers = {
                    "X-Goog-Upload-Protocol": "resumable",
                    "X-Goog-Upload-Command": "start",
                    "X-Goog-Upload-Header-Content-Length": str(f_size),
                    "X-Goog-Upload-Header-Content-Type": mime_type,
                    "Content-Type": "application/json",
                    "x-goog-api-key": key,
                }
                req = request.Request(init_url, data=json.dumps({"file": {"display_name": "batch"}}).encode(), headers=headers, method="POST")
                with opener.open(req, timeout=120) as r:
                    upload_url = r.headers.get("x-goog-upload-url")

                # Step 2: upload the bytes and finalize in one shot.
                with open(file_path, 'rb') as f:
                    f_data = f.read()
                req_up = request.Request(
                    upload_url,
                    data=f_data,
                    headers={"Content-Length": str(f_size), "X-Goog-Upload-Offset": "0", "X-Goog-Upload-Command": "upload, finalize"},
                    method="POST",
                )
                with opener.open(req_up, timeout=180) as r:
                    res = json.loads(r.read().decode())
                    uri, name = res['file']['uri'], res['file']['name']

                # Step 3: poll until the file is processed (up to ~60s).
                active = False
                for attempt in range(30):
                    req_check = request.Request(f"{base_url}/v1beta/{name}", headers={"x-goog-api-key": key})
                    with opener.open(req_check, timeout=30) as r:
                        state = json.loads(r.read().decode()).get('state')
                    if state == "ACTIVE":
                        active = True
                        break
                    if state == "FAILED":
                        break
                    time.sleep(2)

                if not active:
                    if i < len(keys) - 1:
                        continue  # try the upload again with the next key
                    return ["ERROR:" + _("Upload failed.")]

                GeminiHandler._register_file_uri(uri, key)

                # Step 4: run the OCR prompt against the uploaded file.
                url = f"{base_url}/v1beta/models/{model}:generateContent"
                prompt = get_prompt_text("ocr_document_extract")
                contents = [{"parts": [{"file_data": {"mime_type": mime_type, "file_uri": uri}}, {"text": prompt}]}]
                req_gen = request.Request(url, data=json.dumps({"contents": contents}).encode(), headers={"Content-Type": "application/json", "x-goog-api-key": key})
                with opener.open(req_gen, timeout=180) as r:
                    res = json.loads(r.read().decode())
                    text = res['candidates'][0]['content']['parts'][0]['text']
                return text.split('[[[PAGE_SEP]]]')

            except error.HTTPError as e:
                err_code = GeminiHandler._handle_error(e)
                if err_code in ("QUOTA_EXCEEDED", "SERVER_ERROR") and i < len(keys) - 1:
                    continue
                if err_code == "QUOTA_EXCEEDED":
                    # Translators: Message of a dialog which may pop up while performing an AI call
                    err_msg = _("Error 429: Quota Exceeded (Try later)")
                elif err_code == "SERVER_ERROR":
                    # Translators: Message of a dialog which may pop up while performing an AI call
                    err_msg = _("Server Error {code}: {reason}").format(code=e.code, reason=e.reason)
                else:
                    err_msg = err_code
                return ["ERROR:" + err_msg]
            except Exception as e:
                return ["ERROR:" + str(e)]
        return ["ERROR:" + _("All keys failed.")]

    @staticmethod
    def chat(history, new_msg, file_uri, mime_type):
        """Send one chat turn; a *file_uri* forces the key that uploaded it."""
        def _logic(key, hist, msg, uri, mime):
            model = config.conf["VisionAssistant"]["model_name"]
            proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip()
            base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com"
            url = f"{base_url}/v1beta/models/{model}:generateContent"

            user_parts = [{"file_data": {"mime_type": mime, "file_uri": uri}}] if uri else []
            user_parts.append({"text": msg})
            contents = list(hist)
            contents.append({"role": "user", "parts": user_parts})

            req = request.Request(url, data=json.dumps({"contents": contents}).encode(), headers={"Content-Type": "application/json", "x-goog-api-key": key})
            with GeminiHandler._get_opener().open(req, timeout=120) as r:
                return json.loads(r.read().decode())['candidates'][0]['content']['parts'][0]['text']

        # Files are only visible to the key that uploaded them.
        forced_key = GeminiHandler._get_registered_key(file_uri) if file_uri else None
        if forced_key:
            return GeminiHandler._call_with_key(_logic, forced_key, history, new_msg, file_uri, mime_type)
        return GeminiHandler._call_with_rotation(_logic, history, new_msg, file_uri, mime_type)

    @staticmethod
    def upload_for_chat(file_path, mime_type):
        """Upload *file_path* for chat use; return its file URI or None."""
        keys = GeminiHandler._get_api_keys()
        if not keys:
            return None
        opener = GeminiHandler._get_opener()
        proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip()
        base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com"

        for key in keys:
            try:
                f_size = os.path.getsize(file_path)
                init_url = f"{base_url}/upload/v1beta/files"
                headers = {
                    "X-Goog-Upload-Protocol": "resumable",
                    "X-Goog-Upload-Command": "start",
                    "X-Goog-Upload-Header-Content-Length": str(f_size),
                    "X-Goog-Upload-Header-Content-Type": mime_type,
                    "Content-Type": "application/json",
                    "x-goog-api-key": key,
                }
                req = request.Request(init_url, data=json.dumps({"file": {"display_name": os.path.basename(file_path)}}).encode(), headers=headers, method="POST")
                with opener.open(req, timeout=120) as r:
                    upload_url = r.headers.get("x-goog-upload-url")
                with open(file_path, 'rb') as f:
                    f_data = f.read()
                req_up = request.Request(
                    upload_url,
                    data=f_data,
                    headers={"Content-Length": str(f_size), "X-Goog-Upload-Offset": "0", "X-Goog-Upload-Command": "upload, finalize"},
                    method="POST",
                )
                with opener.open(req_up, timeout=180) as r:
                    res = json.loads(r.read().decode())
                    uri, name = res['file']['uri'], res['file']['name']
                # Poll for the ACTIVE state (up to ~60s); no FAILED short-circuit here.
                for attempt in range(30):
                    req_check = request.Request(f"{base_url}/v1beta/{name}", headers={"x-goog-api-key": key})
                    with opener.open(req_check, timeout=30) as r:
                        state = json.loads(r.read().decode()).get('state')
                    if state == "ACTIVE":
                        GeminiHandler._register_file_uri(uri, key)
                        return uri
                    time.sleep(2)
                return None
            except Exception:
                log.debug("Failed to upload file for chat with current key", exc_info=True)
                continue
        return None

    @staticmethod
    def generate_speech(text, voice_name):
        """Return base64-encoded audio for *text* using a prebuilt TTS voice."""
        def _logic(key, txt, voice):
            # TTS uses a dedicated preview model matching the main model's tier.
            main_model = config.conf["VisionAssistant"]["model_name"]
            if "pro" in main_model.lower():
                tts_model = "gemini-2.5-pro-preview-tts"
            else:
                tts_model = "gemini-2.5-flash-preview-tts"

            proxy_url = config.conf["VisionAssistant"]["proxy_url"].strip()
            base_url = proxy_url.rstrip('/') if proxy_url else "https://generativelanguage.googleapis.com"
            url = f"{base_url}/v1beta/models/{tts_model}:generateContent"

            payload = {
                "contents": [{"parts": [{"text": txt}]}],
                "generationConfig": {
                    "responseModalities": ["AUDIO"],
                    "speechConfig": {"voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}}}
                }
            }
            req = request.Request(url, data=json.dumps(payload).encode('utf-8'), headers={"Content-Type": "application/json", "x-goog-api-key": key})
            with GeminiHandler._get_opener().open(req, timeout=600) as r:
                res = json.loads(r.read().decode())
            candidates = res.get('candidates', [])
            if not candidates:
                raise Exception("No candidates returned")
            content = candidates[0].get('content', {})
            parts = content.get('parts', [])
            if not parts:
                raise Exception("No parts in response")
            part = parts[0]
            # The REST API may return camelCase or snake_case field names.
            if 'inlineData' in part:
                return part['inlineData']['data']
            if 'inline_data' in part:
                return part['inline_data']['data']
            if 'text' in part:
                raise Exception(f"Model refused audio: {part['text']}")
            raise Exception("Unknown response format")
        return GeminiHandler._call_with_rotation(_logic, text, voice_name)
payload = { + "contents": [{"parts": [{"text": txt}]}], + "generationConfig": { + "responseModalities": ["AUDIO"], + "speechConfig": {"voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}}} + } + } + req = request.Request(url, data=json.dumps(payload).encode('utf-8'), headers={"Content-Type": "application/json", "x-goog-api-key": key}) + with GeminiHandler._get_opener().open(req, timeout=600) as r: + res = json.loads(r.read().decode()) + candidates = res.get('candidates', []) + if not candidates: raise Exception("No candidates returned") + content = candidates[0].get('content', {}) + parts = content.get('parts', []) + if not parts: raise Exception("No parts in response") + part = parts[0] + if 'inlineData' in part: return part['inlineData']['data'] + if 'inline_data' in part: return part['inline_data']['data'] + if 'text' in part: raise Exception(f"Model refused audio: {part['text']}") + raise Exception("Unknown response format") + return GeminiHandler._call_with_rotation(_logic, text, voice_name) diff --git a/addon/globalPlugins/visionAssistant/updater.py b/addon/globalPlugins/visionAssistant/updater.py new file mode 100644 index 0000000..16e899d --- /dev/null +++ b/addon/globalPlugins/visionAssistant/updater.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +import json +import threading +import logging +import os +import re +import tempfile + +import wx +from urllib import request + +import addonHandler +import gui +import ui +from .constants import ADDON_NAME +from .markdown_utils import clean_markdown +from .services import show_error_dialog + +log = logging.getLogger(__name__) +addonHandler.initTranslation() + +class UpdateDialog(wx.Dialog): + def __init__(self, parent, version, name, changes): + # Translators: Title of update confirmation dialog + super().__init__(parent, title=_("Update Available"), size=(500, 450)) + self.Centre() + + panel = wx.Panel(self) + vbox = wx.BoxSizer(wx.VERTICAL) + + # Translators: Message asking user to update. 
{version} is version number. + msg = _("A new version ({version}) of {name} is available.").format(version=version, name=name) + header = wx.StaticText(panel, label=msg) + vbox.Add(header, 0, wx.ALL, 15) + + # Translators: Label for the changes text box + change_lbl = wx.StaticText(panel, label=_("Changes:")) + vbox.Add(change_lbl, 0, wx.LEFT | wx.RIGHT, 15) + + self.changes_ctrl = wx.TextCtrl(panel, value=changes, style=wx.TE_MULTILINE | wx.TE_READONLY | wx.TE_RICH2) + vbox.Add(self.changes_ctrl, 1, wx.EXPAND | wx.ALL, 15) + + # Translators: Question to download and install + question = wx.StaticText(panel, label=_("Download and Install?")) + vbox.Add(question, 0, wx.LEFT | wx.RIGHT | wx.BOTTOM, 15) + + btn_sizer = wx.BoxSizer(wx.HORIZONTAL) + # Translators: Button to accept update + self.yes_btn = wx.Button(panel, wx.ID_YES, label=_("&Yes")) + # Translators: Button to reject update + self.no_btn = wx.Button(panel, wx.ID_NO, label=_("&No")) + + btn_sizer.Add(self.yes_btn, 0, wx.RIGHT, 10) + btn_sizer.Add(self.no_btn, 0) + vbox.Add(btn_sizer, 0, wx.ALIGN_RIGHT | wx.ALL, 15) + + panel.SetSizer(vbox) + self.yes_btn.SetDefault() + self.yes_btn.Bind(wx.EVT_BUTTON, lambda e: self.EndModal(wx.ID_YES)) + self.no_btn.Bind(wx.EVT_BUTTON, lambda e: self.EndModal(wx.ID_NO)) + +class UpdateManager: + def __init__(self, repo_name): + self.repo_name = repo_name + self.current_version = addonHandler.getCodeAddon().manifest['version'] + + def check_for_updates(self, silent=True): + threading.Thread(target=self._check_thread, args=(silent,), daemon=True).start() + + def _check_thread(self, silent): + try: + url = f"https://api.github.com/repos/{self.repo_name}/releases/latest" + req = request.Request(url, headers={"User-Agent": "NVDA-Addon"}) + with request.urlopen(req, timeout=60) as response: + if response.status == 200: + data = json.loads(response.read().decode('utf-8')) + latest_tag = data.get("tag_name", "").lstrip("v") + if self._compare_versions(latest_tag, 
self.current_version) > 0: + download_url = None + for asset in data.get("assets", []): + if asset["name"].endswith(".nvda-addon"): + download_url = asset["browser_download_url"] + break + if download_url: + raw_changes = data.get("body", "") + + clean_changes = re.split(r'SHA256|Checklist|---', raw_changes, flags=re.I)[0].strip() + clean_changes = clean_markdown(clean_changes) + + wx.CallAfter(self._prompt_update, latest_tag, download_url, clean_changes) + elif not silent: + # Translators: Error message when an update is found but the addon file is missing from GitHub. + msg = _("Update found but no .nvda-addon file in release.") + show_error_dialog(msg) + elif not silent: + # Translators: Status message informing the user they are already on the latest version. + msg = _("You have the latest version.") + wx.CallAfter(ui.message, msg) + except Exception as e: + if not silent: + msg = _("Update check failed: {error}").format(error=e) + show_error_dialog(msg) + + def _compare_versions(self, v1, v2): + try: + parts1 = [int(x) for x in v1.split('.')] + parts2 = [int(x) for x in v2.split('.')] + return (parts1 > parts2) - (parts1 < parts2) + except Exception: + return 0 if v1 == v2 else 1 + + def _prompt_update(self, version, url, changes): + dlg = UpdateDialog(gui.mainFrame, version, ADDON_NAME, changes) + if dlg.ShowModal() == wx.ID_YES: + threading.Thread(target=self._download_install_worker, args=(url,), daemon=True).start() + dlg.Destroy() + + def _download_install_worker(self, url): + try: + # Translators: Message shown while downloading update + msg = _("Downloading update...") + wx.CallAfter(ui.message, msg) + temp_dir = tempfile.gettempdir() + file_path = os.path.join(temp_dir, "VisionAssistant_Update.nvda-addon") + with request.urlopen(url) as response, open(file_path, 'wb') as out_file: + out_file.write(response.read()) + wx.CallAfter(os.startfile, file_path) + except Exception as e: + # Translators: Error message for download failure + msg = _("Download 
failed: {error}").format(error=e) + show_error_dialog(msg) + +# --- UI Classes ---