diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 15be72455..065fb8a94 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -14,7 +14,7 @@
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, LLaVABench_KO, VGRPBench, MMVet, MTVQADataset,
TableVQABench, CustomVQADataset, CRPE, MathVerse, OlympiadBench, SeePhys, QSpatial, VizWiz, MMNIAH, LogicVista,
MME_CoT, MMSci_Captioning, Physics_yale, TDBenchGrounding, WildDocBenchmark, OCR_Reasoning, PhyX, CountBenchQA,
- ZEROBench, Omni3DBench, TallyQA, MMEReasoning, MMVMBench, BMMR, OCRBench_v2, AyaVisionBench, MathCanvas, MMReason
+    ZEROBench, Omni3DBench, TallyQA, MMEReasoning, MMVMBench, BMMR, OCRBench_v2, AyaVisionBench, MathCanvas, MMReason,
+    ScienceOlympiad, Galaxy10DECaLS, VRSBench
)
from .image_ccocr import CCOCRDataset
@@ -231,7 +231,7 @@ def evaluate(self, eval_file, **judge_kwargs):
AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, SimpleVQA, HiPhODataset, MaCBench,
- UniSVG, SArena_MINI, MMSIVideoBench,
+    UniSVG, SArena_MINI, MMSIVideoBench, ScienceOlympiad, Galaxy10DECaLS, VRSBench,
]
VIDEO_DATASET = [
diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
index 23aefb946..9a3d3a8e9 100644
--- a/vlmeval/dataset/image_vqa.py
+++ b/vlmeval/dataset/image_vqa.py
@@ -3813,3 +3813,303 @@ def evaluate(self, eval_file, **judge_kwargs):
score_pth = storage_score.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score
+
+class ScienceOlympiad(ImageBaseDataset):
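+    """Science Olympiad exam questions posed as open-ended VQA.
+
+    The prompt asks the model to reason step by step and put the final answer
+    in \\boxed{}; scoring is rule-based via math_verify with an LLM-judge
+    fallback (see utils/scienceolympiad.py).
+    """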
+ TYPE = 'VQA'
+ DATASET_URL = {
+ 'ScienceOlympiad': 'LMUData/ScienceOlympiad.tsv',
+ }
+ DATASET_MD5 = {
+ 'ScienceOlympiad': '54b8d3086958cc294a8122b178a7ae5e',
+ }
+ def __init__(self, dataset='ScienceOlympiad', skip_noimg=False):
+ ROOT = LMUDataRoot()
+ # You can override this variable to save image files to a different directory
+ self.dataset_name = dataset
+ self.img_root = osp.join(ROOT, 'images', 'ScienceOlympiad')
+
+ data = self.load_data(dataset)
+ self.skip_noimg = skip_noimg
+ data['index'] = [str(x) for x in data['index']]
+ self.meta_only = False
+ if np.all([istype(x, int) for x in data['index']]):
+ data['index'] = [int(x) for x in data['index']]
+ self.data = data
+ self.post_build(dataset)
+
+ def build_prompt(self, line):
+ if isinstance(line, int):
+ line = self.data.iloc[line]
+
+ if pd.isna(line['image']):
+ tgt_path = None
+ else:
+ if self.meta_only:
+ tgt_path = toliststr(line['image'])
+ else:
+ tgt_path = self.dump_image(line)
+ instruction = (
+ f"{line['question']}\n"
+ "请一步步推理,并把你的最终答案放入\\boxed{}。")
+ msgs = []
+ if tgt_path is not None:
+ if isinstance(tgt_path, list):
+ msgs.extend([{"type": "image", "value": p} for p in tgt_path])
+ else:
+ msgs.append({"type": "image", "value": tgt_path})
+
+ msgs.append({"type": "text", "value": instruction})
+
+ return msgs
+
+ def evaluate(self, eval_file, **judge_kwargs):
+ from .utils.scienceolympiad import ScienceOlympiad_acc, ScienceOlympiad_auxeval
+
+ if 'LOCAL_LLM' in os.environ:
+ model = os.path.basename(os.environ.get('LOCAL_LLM'))
+ print(f'Using local model as judge model for ScienceOlympiad: {model}')
+ else:
+ model = judge_kwargs.setdefault('model', 'gpt-4o')
+ storage = get_intermediate_file_path(eval_file, f'_{model}_evaluation')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}_evaluation', 'pkl')
+ nproc = judge_kwargs.pop('nproc', 4)
+
+ meta = self.data
+ prediction_data = load(eval_file)
+ if eval_file.endswith('.json'):
+ prediction_data = pd.DataFrame(prediction_data)
+
+ if not osp.exists(storage):
+ data = prediction_data.copy()
+ data = data.sort_values(by='index')
+ judge_kwargs['max_tokens'] = 4096
+ model = build_judge(**judge_kwargs)
+ assert model.working(), 'ScienceOlympiad evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
+
+ lt = len(data)
+ lines = [data.iloc[i] for i in range(lt)]
+ tups = [(model, line) for line in lines]
+ indices = [line['index'] for line in lines]
+
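+            # Resume from the intermediate pkl so already-judged samples are not re-evaluated on restart.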
+ ans = {}
+ if osp.exists(tmp_file):
+ ans = load(tmp_file)
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
+ indices = [i for i in indices if i not in ans]
+
+ if len(indices):
+ new_results = track_progress_rich(
+ ScienceOlympiad_auxeval,
+ tups,
+ nproc=nproc,
+ chunksize=nproc,
+ keys=indices,
+ save=tmp_file,
+ )
+ ans = load(tmp_file)
+ for k, v in zip(indices, new_results):
+ assert k in ans
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
+
+ data['result'] = [ans[idx]['res'] for idx in data['index']]
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
+ dump(data, storage)
+
+ score = ScienceOlympiad_acc(storage)
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
+ dump(score, score_pth)
+
+ return score
+
+class Galaxy10DECaLS(ImageBaseDataset):
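+    """Galaxy morphology classification (Galaxy10 DECaLS) posed as VQA.
+
+    Scoring is fully rule-based: the boxed class name in the prediction is
+    mapped to its label id and compared with the ground-truth label
+    (see utils/galaxy10_decals.py).
+    """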
+ TYPE = 'VQA'
+ DATASET_URL = {
+ 'Galaxy10DECaLS': 'LMUData/Galaxy10DECaLS.tsv',
+ }
+ DATASET_MD5 = {
+ 'Galaxy10DECaLS': 'a3379754c74bc5502de82d11bc837ed3',
+ }
+ def __init__(self, dataset='Galaxy10DECaLS', skip_noimg=False):
+ ROOT = LMUDataRoot()
+ # You can override this variable to save image files to a different directory
+ self.dataset_name = dataset
+ self.img_root = osp.join(ROOT, 'images', 'Galaxy10DECaLS')
+
+ data = self.load_data(dataset)
+ self.skip_noimg = skip_noimg
+ data['index'] = [str(x) for x in data['index']]
+ self.meta_only = False
+ if np.all([istype(x, int) for x in data['index']]):
+ data['index'] = [int(x) for x in data['index']]
+ self.data = data
+ self.post_build(dataset)
+
+ def build_prompt(self, line):
+ if isinstance(line, int):
+ line = self.data.iloc[line]
+
+ if pd.isna(line['image']):
+ tgt_path = None
+ else:
+ if self.meta_only:
+ tgt_path = toliststr(line['image'])
+ else:
+ tgt_path = self.dump_image(line)
+
+ instruction = f"{line['question']}"
+ msgs = []
+ if tgt_path is not None:
+ if isinstance(tgt_path, list):
+ msgs.extend([{"type": "image", "value": p} for p in tgt_path])
+ else:
+ msgs.append({"type": "image", "value": tgt_path})
+ msgs.append({"type": "text", "value": instruction})
+ return msgs
+
+
+ def evaluate(self, eval_file, **judge_kwargs):
+ from .utils.galaxy10_decals import Galaxy10DECaLS_acc, Galaxy10DECaLS_auxeval
+
+ storage = get_intermediate_file_path(eval_file, '_evaluation')
+ tmp_file = get_intermediate_file_path(eval_file, '_evaluation', 'pkl')
+ nproc = judge_kwargs.pop('nproc', 4)
+
+ meta = self.data
+ prediction_data = load(eval_file)
+ if eval_file.endswith('.json'):
+ prediction_data = pd.DataFrame(prediction_data)
+
+ if not osp.exists(storage):
+ data = prediction_data.copy()
+ data = data.sort_values(by='index')
+ lt = len(data)
+ lines = [data.iloc[i] for i in range(lt)]
+            tups = [(line,) for line in lines]
+ indices = [line['index'] for line in lines]
+
+ ans = {}
+ if osp.exists(tmp_file):
+ ans = load(tmp_file)
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
+ indices = [i for i in indices if i not in ans]
+
+ if len(indices):
+ new_results = track_progress_rich(
+ Galaxy10DECaLS_auxeval,
+ tups,
+ nproc=nproc,
+ chunksize=nproc,
+ keys=indices,
+ save=tmp_file,
+ )
+ ans = load(tmp_file)
+ for k, v in zip(indices, new_results):
+ assert k in ans
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
+
+ data['result'] = [ans[idx]['res'] for idx in data['index']]
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
+ dump(data, storage)
+
+ score = Galaxy10DECaLS_acc(storage)
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
+ dump(score, score_pth)
+
+ return score
+
+class VRSBench(ImageBaseDataset):
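+    """VRSBench remote-sensing VQA (full set and MINI split).
+
+    Evaluation first tries rule-based matching (substring with a negation
+    check, numeric and yes/no exact match) and falls back to an LLM judge
+    (see utils/vrsbench.py).
+    """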
+ TYPE = 'VQA'
+ DATASET_URL = {
+ 'VRSBench': 'LMUData/VRSBench.tsv',
+ 'VRSBench_MINI': 'LMUData/VRSBench_MINI.tsv',
+ }
+ DATASET_MD5 = {
+ 'VRSBench': 'b6ba8c741e36b4d4b56793361a01c486',
+ 'VRSBench_MINI': '7ddb6e0647db9b8623e83957b6457b1e',
+ }
+ def __init__(self, dataset='VRSBench_MINI', skip_noimg=False):
+ ROOT = LMUDataRoot()
+ # You can override this variable to save image files to a different directory
+ self.dataset_name = dataset
+ self.img_root = osp.join(ROOT, 'images', dataset)
+ self.meta_only = False
+ data = self.load_data(dataset)
+ self.skip_noimg = skip_noimg
+ data['index'] = [str(x) for x in data['index']]
+ if np.all([istype(x, int) for x in data['index']]):
+ data['index'] = [int(x) for x in data['index']]
+ self.data = data
+ self.post_build(dataset)
+
+ def build_prompt(self, line):
+ if isinstance(line, int):
+ line = self.data.iloc[line]
+
+ if self.meta_only:
+ tgt_path = toliststr(osp.join(self.img_root,line['image_path']))
+ else:
+ tgt_path = self.dump_image(line)
+
+ instruction = f"{line['question']}"
+ msgs = []
+ if tgt_path is not None:
+ if isinstance(tgt_path, list):
+ msgs.extend([{"type": "image", "value": p} for p in tgt_path])
+ else:
+ msgs.append({"type": "image", "value": tgt_path})
+ msgs.append({"type": "text", "value": instruction})
+
+ return msgs
+
+ def evaluate(self, eval_file, **judge_kwargs):
+ from .utils.vrsbench import VRSBench_acc, VRSBench_auxeval
+
+        model = judge_kwargs.setdefault('model', 'gpt-4o')
+ storage = get_intermediate_file_path(eval_file, f'_{model}_evaluation')
+ tmp_file = get_intermediate_file_path(eval_file, f'_{model}_evaluation', 'pkl')
+ nproc = judge_kwargs.pop('nproc', 4)
+
+ meta = self.data
+ prediction_data = load(eval_file)
+ if eval_file.endswith('.json'):
+ prediction_data = pd.DataFrame(prediction_data)
+
+ if not osp.exists(storage):
+ data = prediction_data.copy()
+ data = data.sort_values(by='index')
+ lt = len(data)
+ model = build_judge(**judge_kwargs)
+ assert model.working(), 'VRSBench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE
+ lines = [data.iloc[i] for i in range(lt)]
+ tups = [(model,line) for line in lines]
+ indices = [line['index'] for line in lines]
+
+ ans = {}
+ if osp.exists(tmp_file):
+ ans = load(tmp_file)
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
+ indices = [i for i in indices if i not in ans]
+
+ if len(indices):
+ new_results = track_progress_rich(
+ VRSBench_auxeval,
+ tups,
+ nproc=nproc,
+ chunksize=nproc,
+ keys=indices,
+ save=tmp_file,
+ )
+ ans = load(tmp_file)
+ for k, v in zip(indices, new_results):
+ assert k in ans
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
+
+ data['result'] = [ans[idx]['res'] for idx in data['index']]
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
+ dump(data, storage)
+
+ score = VRSBench_acc(storage)
+ score_pth = get_intermediate_file_path(storage, '_score', 'csv')
+ dump(score, score_pth)
+
+ return score
\ No newline at end of file
diff --git a/vlmeval/dataset/utils/galaxy10_decals.py b/vlmeval/dataset/utils/galaxy10_decals.py
new file mode 100644
index 000000000..e03e33067
--- /dev/null
+++ b/vlmeval/dataset/utils/galaxy10_decals.py
@@ -0,0 +1,129 @@
+import re
+from tqdm import tqdm
+from ...smp import *
+from collections import defaultdict
+
+
+def check_format(llm_output):
+    # Check that the output contains exactly one non-empty <answer> ... </answer>
+    # block (assumes answers are wrapped in <answer> tags).
+    if '<answer>' not in llm_output or '</answer>' not in llm_output:
+        return 0.0
+    if llm_output.count('<answer>') != llm_output.count('</answer>') or llm_output.count('<answer>') != 1:
+        return 0.0
+    final_answer = llm_output.split('<answer>', 1)[1].split('</answer>')[0].strip()
+    if final_answer:
+        return 1.0
+    else:
+        return 0.0
+
+def remove_boxed(s):
+ if "\\boxed " in s:
+ left = "\\boxed "
+ assert s[: len(left)] == left
+ return s[len(left) :]
+
+ left = "\\boxed{"
+
+ assert s[: len(left)] == left
+ assert s[-1] == "}"
+
+ return s[len(left) : -1]
+
+def last_boxed_only_string(string):
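+    # Find the last '\boxed' (or '\fbox') and scan forward, balancing braces,
+    # to return the complete '\boxed{...}' expression, or None if none is found.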
+ idx = string.rfind("\\boxed")
+ if "\\boxed " in string:
+ return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+ if idx < 0:
+ idx = string.rfind("\\fbox")
+ if idx < 0:
+ return None
+
+ i = idx
+ right_brace_idx = None
+ num_left_braces_open = 0
+ while i < len(string):
+ if string[i] == "{":
+ num_left_braces_open += 1
+ if string[i] == "}":
+ num_left_braces_open -= 1
+ if num_left_braces_open == 0:
+ right_brace_idx = i
+ break
+ i += 1
+
+ retval = None if right_brace_idx is None else string[idx : right_brace_idx + 1]
+
+ return retval
+
+def Galaxy10DECaLS_auxeval(line):
+ label_dict = {
+ "Disturbed Galaxies": 0,
+ "Merging Galaxies": 1,
+ "Round Smooth Galaxies": 2,
+ "In-between Round Smooth Galaxies": 3,
+ "Cigar Shaped Smooth Galaxies": 4,
+ "Barred Spiral Galaxies": 5,
+ "Unbarred Tight Spiral Galaxies": 6,
+ "Unbarred Loose Spiral Galaxies": 7,
+ "Edge-on Galaxies without Bulge": 8,
+ "Edge-on Galaxies with Bulge": 9
+ }
+ log = ''
+ response = line['prediction']
+ if not response or not isinstance(response, str):
+ log += 'Invalid response format, returning False.'
+ return dict(log=log, res=False)
+
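+    # Restore literal backspace characters to '\b' (so a '\boxed' whose backslash
+    # was mis-decoded can still be found) and keep only the text after the
+    # closing reasoning tag, if any.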
+    if '</think>' in response:
+        final_ans = response.replace('\x08', '\\b').split('</think>')[1]
+    else:
+        final_ans = response.replace('\x08', '\\b')
+
+ ground_truth = line['answer']
+ # print(final_ans, ground_truth)
+ if final_ans is None or not isinstance(final_ans, str) or final_ans.strip() == "":
+ log += 'Invalid final answer, returning False.'
+ return dict(log=log, res=False)
+ boxed_answer = last_boxed_only_string(final_ans)
+ if boxed_answer is None or boxed_answer == '':
+ log += 'Invalid extract answer, returning False.'
+ return dict(log=log, res=False)
+ else:
+ boxed_answer = remove_boxed(boxed_answer)
+ if boxed_answer in label_dict and label_dict[boxed_answer] == ground_truth:
+            log += 'Prefetch succeed. Rule evaluation succeeded.'
+ return dict(log=log, res=True)
+ else:
+ log += 'Boxed answer does not match ground truth.'
+ return dict(log=log, res=False)
+
+def Galaxy10DECaLS_acc(result_file):
+ data = load(result_file)
+ if result_file.endswith('.json'):
+ data = pd.DataFrame(data)
+ tot = defaultdict(int)
+ hit = defaultdict(int)
+ fetch = defaultdict(int)
+ lt = len(data)
+
+ for i in tqdm(range(lt)):
+ item = data.iloc[i]
+ tot['Overall'] += 1
+ if 'log' in item:
+ log_value = item['log']
+ else:
+ log_value = item['answer_match_log']
+ if 'Prefetch succeed' in log_value:
+ fetch['Overall'] += 1
+ if item.get('result'):
+ hit['Overall'] += 1
+
+ res = defaultdict(list)
+ for k in tot:
+ res['total'].append(tot[k])
+ # res['hit'].append(hit[k])
+ res['acc'].append(hit[k] / tot[k] * 100 if tot[k] else 0.0)
+ # res['rule_prefetch'].append(fetch[k])
+ res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+
+ return pd.DataFrame(res)
\ No newline at end of file
diff --git a/vlmeval/dataset/utils/scienceolympiad.py b/vlmeval/dataset/utils/scienceolympiad.py
new file mode 100644
index 000000000..1bc78614e
--- /dev/null
+++ b/vlmeval/dataset/utils/scienceolympiad.py
@@ -0,0 +1,181 @@
+import requests
+from math_verify import verify, parse
+import pandas as pd
+from tqdm import tqdm
+from ...smp import *
+from collections import defaultdict
+import logging
+
+
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
+
+# Input
+[Question]
+{question}
+
+[Reference Answer]
+{gold}
+
+[Predicted Answer]
+{pred}
+
+# Evaluation Rules
+- The predicted answer of the model may contain the reasoning process; extract the final answer from it.
+- Evaluate the model's answer based on correctness compared to the reference answer.
+- Ignore language differences: If the core meaning of the predicted answer (after extracting the final result) is consistent with the reference answer (even if one is in English and the other in Chinese), it is considered correct.
+- Formula/Chemical Expression Evaluation: For chemical formulas, chemical equations, or physical formulas, judge based on core consistency:
+ * Chemical formulas: Consistent elemental composition and valence (e.g., H2O and H₂O are equivalent, NaOH, sodium hydroxide, and 氢氧化钠 are equivalent if the reference/predicted uses name/formula respectively).
+ * Chemical equations: Consistent reactants, products, and balanced stoichiometry (ignore minor formatting differences like spaces or superscript/subscript display).
+ * Physical formulas: Consistent variables, mathematical relationships, and key constants (ignore formatting differences like parentheses or symbol case if the core logic is identical).
+- For questions with multiple minor issues, the predicted answer is determined to be correct only if it meets the reference answers for all minor issues; otherwise, it is considered incorrect.
+- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
+- Treat numerical answers as correct if they match within reasonable precision.
+- For questions requiring units, both the value and the unit must be correct.
+
+Grade the predicted answer of this new question as one of:
+A: CORRECT
+B: INCORRECT
+
+Just return the letter "A" or "B", with no other text around it.
+"""
+
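+# Wrap the ground truth in '\boxed{...}' so math_verify compares the model's
+# boxed answer and the reference in the same form.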
+def parse_boxed(final_ans, ground_truth):
+    ground_truth = "\\boxed{" + ground_truth + "}"
+    pred_result = last_boxed_only_string(final_ans)
+ if pred_result is None:
+ return None, None
+
+ extract_ans = None
+ if pred_result.startswith("\\boxed{"):
+ extract_ans = pred_result[7:-1]
+ elif pred_result.startswith("\\fbox{"):
+ extract_ans = pred_result[6:-1]
+ elif pred_result.startswith("\\boxed "):
+ extract_ans = pred_result[7:]
+
+ is_pass = eval_math_verify(pred_result, ground_truth)
+ return is_pass, extract_ans
+
+def eval_math_verify(predicted, ground_truth):
+ predicted = parse(predicted, parsing_timeout=None)
+ ground_truth = parse(ground_truth, parsing_timeout=None)
+    is_correct = verify(predicted, ground_truth, timeout_seconds=None)
+ return is_correct
+
+def last_boxed_only_string(string):
+ idx = string.rfind("\\boxed")
+ if "\\boxed " in string:
+ return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+ if idx < 0:
+ idx = string.rfind("\\fbox")
+ if idx < 0:
+ return None
+
+ i = idx
+ right_brace_idx = None
+ num_left_braces_open = 0
+ while i < len(string):
+ if string[i] == "{":
+ num_left_braces_open += 1
+ if string[i] == "}":
+ num_left_braces_open -= 1
+ if num_left_braces_open == 0:
+ right_brace_idx = i
+ break
+ i += 1
+
+ retval = None if right_brace_idx is None else string[idx : right_brace_idx + 1]
+ return retval
+
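+# Two-stage check: first try to verify the boxed answer against the ground
+# truth with math_verify ("prefetch"); fall back to the LLM judge otherwise.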
+def ScienceOlympiad_auxeval(model, line):
+
+ log = ''
+ response = line['prediction']
+ if not response or not isinstance(response, str):
+ log += 'Invalid response format, returning False.'
+ return dict(log=log, res=False)
+
+    if '</think>' in response:
+        final_ans = response.split('</think>')[1]
+    else:
+        final_ans = response
+
+ gt = line['answer'].strip()
+ if final_ans is None or not isinstance(final_ans, str) or final_ans.strip() == "":
+ log += 'Invalid final answer, returning False.'
+ return dict(log=log, res=False)
+ is_pass, extract_ans = parse_boxed(final_ans, gt)
+    if is_pass:
+        log += 'Prefetch succeed. math_verify evaluation succeeded.'
+ return dict(log=log, res=True)
+ # if extract_ans is None:
+ # log += 'Invalid extract answer, returning False.'
+ # return dict(log=log, res=False)
+
+ # llm judge
+ else:
+ question = line['question']
+ validation_prompt = DEFAULT_PROMPT_TEMPLATE.format(
+ question=question,
+ gold=gt,
+ pred=final_ans
+ )
+
+ retry = 5
+ for i in range(retry):
+ prediction = line['prediction']
+            res = model.generate(validation_prompt, temperature=0.1)  # could use temperature=i * 0.5 to vary retries
+
+            if FAIL_MSG in res or res.strip() not in ['A', 'B']:
+                log += f'Try {i}: output is {prediction}, answer is {extract_ans}. judge_model response is {res}, failed to eval with judge_model.\n'
+            else:
+                log += 'Judge model evaluated successfully.'
+                re_score = res.strip() == 'A'
+                logging.info(f"Judge model evaluated successfully. Response is: {res.strip()}")
+                return dict(log=log, res=re_score)
+ log += 'All 5 retries failed.\n'
+
+ return dict(log=log, res=False)
+
+def ScienceOlympiad_acc(result_file):
+ data = load(result_file)
+ if result_file.endswith('.json'):
+ data = pd.DataFrame(data)
+ tot = defaultdict(int)
+ hit = defaultdict(int)
+ fetch = defaultdict(int)
+ lt = len(data)
+
+ for i in tqdm(range(lt)):
+ item = data.iloc[i]
+ cate = item.get('category', 'Overall')
+
+ tot['Overall'] += 1
+ tot[cate] += 1
+ if 'log' in item:
+ log_value = item['log']
+ else:
+ log_value = item['answer_match_log']
+ if 'Prefetch succeed' in log_value:
+ fetch['Overall'] += 1
+ fetch[cate] += 1
+ if item.get('result'):
+ hit['Overall'] += 1
+ hit[cate] += 1
+
+ res = defaultdict(list)
+ for k in tot:
+ res['Subject'].append(k)
+ res['total'].append(tot[k])
+ # res['hit'].append(hit[k])
+ res['acc'].append(hit[k] / tot[k] * 100 if tot[k] else 0.0)
+ # res['rule_prefetch'].append(fetch[k])
+ res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+
+ return pd.DataFrame(res).sort_values('Subject', ignore_index=True)
\ No newline at end of file
diff --git a/vlmeval/dataset/utils/vrsbench.py b/vlmeval/dataset/utils/vrsbench.py
new file mode 100644
index 000000000..dda3ebd04
--- /dev/null
+++ b/vlmeval/dataset/utils/vrsbench.py
@@ -0,0 +1,208 @@
+import re
+import string
+from tqdm import tqdm
+from ...smp import *
+import pandas as pd
+from collections import defaultdict
+import logging
+
+
+FAIL_MSG = 'Failed to obtain answer via API.'
+
+
+DEFAULT_PROMPT_TEMPLATE = """
+You are an evaluation model. Your task is to judge whether a predicted answer
+should be considered correct based on the question and ground truth.
+
+Be objective and strict.
+
+Question: {question}
+Ground Truth: {gold}
+Predicted Answer: {pred}
+
+Please output exactly one character:
+- Output '1' if the predicted answer should be considered correct.
+- Output '0' if the predicted answer should be considered incorrect.
+"""
+
+
+
+NEGATIVE_WORDS = ['not', 'no', 'never', "n't", 'without']
+WINDOW_SIZE = 3
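+# A ground-truth match inside the prediction is rejected if a negation word
+# occurs within WINDOW_SIZE words before or after the matched span.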
+
+def standardize_text(text):
+ text = text.lower()
+ text = text.translate(str.maketrans('', '', string.punctuation))
+ text = ' '.join(text.split())
+ return text
+
+def check_negation_around_match(ground_truth, predicted):
+
+ standardized_gt = standardize_text(ground_truth)
+ gt_word_list = standardized_gt.split()
+
+ standardized_pred = standardize_text(predicted)
+ pred_word_list = standardized_pred.split()
+
+ if not gt_word_list:
+ return False
+
+ match_indices = []
+ for i in range(len(pred_word_list) - len(gt_word_list) + 1):
+ if pred_word_list[i:i + len(gt_word_list)] == gt_word_list:
+ match_indices.append(i)
+
+ if not match_indices:
+ return False
+
+ gt_len = len(gt_word_list)
+ for start_index in match_indices:
+ pre_start = max(0, start_index - WINDOW_SIZE)
+ for i in range(pre_start, start_index):
+ if pred_word_list[i] in NEGATIVE_WORDS:
+ return True
+
+ post_end = min(len(pred_word_list), start_index + gt_len + WINDOW_SIZE)
+ for i in range(start_index + gt_len, post_end):
+ if pred_word_list[i] in NEGATIVE_WORDS:
+ return True
+
+ return False
+
+
+def check_format(llm_output):
+    # Check that the output contains exactly one non-empty <answer> ... </answer>
+    # block (assumes answers are wrapped in <answer> tags).
+    if '<answer>' not in llm_output or '</answer>' not in llm_output:
+        return 0.0
+
+    if llm_output.count('<answer>') != llm_output.count('</answer>') or llm_output.count('<answer>') != 1:
+        return 0.0
+
+    final_answer = llm_output.split('<answer>', 1)[1].split('</answer>')[0].strip()
+
+    if final_answer:
+        return 1.0
+    else:
+        return 0.0
+
+def VRSBench_auxeval(model, line):
+    """Three-stage answer matching:
+
+    1. Substring match plus a negation-word check around the matched span.
+    2. Exact match for 'yes'/'no' answers and numeric values.
+    3. LLM judge for everything else.
+    """
+ log = ''
+ response = line['prediction']
+    gt = line['answer'].strip().lower() if isinstance(line['answer'], str) else line['answer']
+    if not gt:
+ log += 'Invalid ground truth format, returning False.'
+ return dict(log=log, res=False)
+ if not response or not isinstance(response, str):
+ log += 'Invalid response format, returning False.'
+ return dict(log=log, res=False)
+
+    if '</think>' in response:
+        final_ans = response.split('</think>')[1].lower()
+    else:
+        final_ans = response.lower()
+ if final_ans is None or not isinstance(final_ans, str) or final_ans.strip() == "":
+ log += 'Invalid final answer, returning False.'
+ return dict(log=log, res=False)
+    # 1. The ground truth is a substring of the prediction.
+ if gt in final_ans:
+ if check_negation_around_match(gt, final_ans):
+ log += 'Decorated by a negative word, returning False.'
+ return dict(log=log, res=False)
+ else:
+ log += 'Prefetch succeed. Matching successfully.'
+ return dict(log=log, res=True)
+    # 2. The ground truth is 'yes', 'no', or a number and requires an exact match.
+    try:
+ gt_num = float(gt)
+ final_ans_num = float(final_ans)
+ if gt_num == final_ans_num:
+ log += 'Prefetch succeed. Matching successfully.'
+ return dict(log=log, res=True)
+ else:
+            log += 'Numeric answers do not match, returning False.'
+ return dict(log=log, res=False)
+
+ except ValueError:
+        # Not a number: fall back to the original 'yes'/'no' check.
+ if gt in ['yes', 'no']:
+ if gt == final_ans:
+ log += 'Prefetch succeed. Matching successfully.'
+ return dict(log=log, res=True)
+ else:
+                log += 'Yes/no answer does not match, returning False.'
+ return dict(log=log, res=False)
+        # 3. Complex cases: defer to the LLM judge.
+ else:
+ question = line['question']
+ validation_prompt = DEFAULT_PROMPT_TEMPLATE.format(
+ question=question,
+ gold=gt,
+ pred=final_ans
+ )
+
+ retry = 5
+ for i in range(retry):
+ prediction = line['prediction']
+                res = model.generate(validation_prompt, temperature=0.1)  # could use temperature=i * 0.5 to vary retries
+                match = re.search(r'[01]', res)
+                if FAIL_MSG in res or not match:
+                    log += f'Try {i}: output is {prediction}, answer is {final_ans}. judge_model response is {res}, failed to eval with judge_model.\n'
+                else:
+                    log += 'Judge model evaluated successfully.'
+                    re_score = match.group(0) == '1'
+                    logging.info(f"Judge model evaluated successfully. Response is: {res.strip()}")
+                    return dict(log=log, res=re_score)
+ log += 'All 5 retries failed.\n'
+
+ return dict(log=log, res=False)
+
+def VRSBench_acc(result_file):
+ data = load(result_file)
+ if result_file.endswith('.json'):
+ data = pd.DataFrame(data)
+ tot = defaultdict(int)
+ hit = defaultdict(int)
+ fetch = defaultdict(int)
+ lt = len(data)
+
+ for i in tqdm(range(lt)):
+ item = data.iloc[i]
+ cate = item.get('category', 'Overall')
+
+ tot['Overall'] += 1
+ tot[cate] += 1
+ if 'log' in item:
+ log_value = item['log']
+ else:
+ log_value = item['answer_match_log']
+ if 'Prefetch succeed' in log_value:
+ fetch['Overall'] += 1
+ fetch[cate] += 1
+ if item.get('result'):
+ hit['Overall'] += 1
+ hit[cate] += 1
+
+ res = defaultdict(list)
+ for k in tot:
+ res['Subject'].append(k)
+ res['total'].append(tot[k])
+ # res['hit'].append(hit[k])
+ res['acc'].append(hit[k] / tot[k] * 100 if tot[k] else 0.0)
+ # res['rule_prefetch'].append(fetch[k])
+ res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
+
+ return pd.DataFrame(res).sort_values('Subject', ignore_index=True)
+