diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 15be72455..a5b46f59b 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -101,7 +101,7 @@ from .macbench import MaCBench
 from .sarena_mini import SArena_MINI
 from .uni_svg import UniSVG
-
+from .vladbench import VLADBench
 
 
 class ConcatDataset(ImageBaseDataset):
     # This dataset takes multiple dataset names as input and aggregate them into a single dataset.
@@ -231,7 +231,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
     MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
     olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, SimpleVQA, HiPhODataset, MaCBench,
-    UniSVG, SArena_MINI, MMSIVideoBench,
+    UniSVG, SArena_MINI, MMSIVideoBench, VLADBench,
 ]
 
 VIDEO_DATASET = [
diff --git a/vlmeval/dataset/utils/vladbench.py b/vlmeval/dataset/utils/vladbench.py
new file mode 100644
index 000000000..22871dc6f
--- /dev/null
+++ b/vlmeval/dataset/utils/vladbench.py
@@ -0,0 +1,424 @@
+import os
+import json
+import re
+import numpy as np
+from vlmeval.smp import misc
+
+
+def weighted_row_sum(data, third_rows, weight_col=1, start_col=2):
+    # Append a 'Mean' row: the weighted average of the last `third_rows` rows,
+    # using column `weight_col` (the question counts) as weights.
+    data = np.array(data)
+    m, n = data.shape
+    rows = slice(m - third_rows, m)
+    cols = slice(start_col, None)
+    weighted_sum = np.sum(
+        data[rows, cols].astype(float) * data[rows, weight_col].astype(float)[:, np.newaxis], axis=0
+    ) / np.sum(data[rows, weight_col].astype(float))
+    weighted_sum = ['Mean', np.sum(data[rows, weight_col].astype(float))] + weighted_sum.tolist()
+    temp = data.tolist()
+    temp.append(weighted_sum)
+    return temp
+
+
+def weighted_total(data, weight_col=1, start_col=2):
+    # Weighted average over all rows, returned as a single 'Total' row.
+    data = np.array(data)
+    m, n = data.shape
+    rows = slice(0, m)
+    cols = slice(start_col, None)
+    weighted_sum = np.sum(
+        data[rows, cols].astype(float) * data[rows, weight_col].astype(float)[:, np.newaxis], axis=0
+    ) / np.sum(data[rows, weight_col].astype(float))
+    weighted_sum = ['Total', np.sum(data[rows, weight_col].astype(float))] + weighted_sum.tolist()
+    return weighted_sum
+
+
+def box_iou(boxA, boxB):
+    # IoU of two [x1, y1, x2, y2] boxes (pixel-inclusive convention).
+    boxA = [int(x) for x in boxA]
+    boxB = [int(x) for x in boxB]
+
+    xA = max(boxA[0], boxB[0])
+    xB = min(boxA[2], boxB[2])
+    yA = max(boxA[1], boxB[1])
+    yB = min(boxA[3], boxB[3])
+
+    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
+
+    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
+    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
+
+    iou = interArea / float(boxAArea + boxBArea - interArea)
+    return iou
+
+
+def clean_string(s):
+    # Strip leading/trailing punctuation and whitespace.
+    while s and (s[0] in ":[]()' ."):
+        s = s[1:]
+    while s and (s[-1] in ":[]()' ."):
+        s = s[:-1]
+    return s
+
+
+def convert_if_number(answer):
+    if isinstance(answer, (int, float)):
+        return str(answer)
+    return answer
+
+
+def remove_symbols(input_string):
+    input_string = str(input_string)
+    if 'correct answer is:' in input_string:
+        input_string = input_string.split('correct answer is:')[-1]
+    cleaned_string = re.sub(r'[\*\n\""]', '', input_string)
+    return cleaned_string
+
+
+def extract_options(text):
+    # Pull the option list out of the last "[...]" block in the question text.
+    pattern = re.compile(r"\[([^\]]+)\]")
+    matches = pattern.findall(text)
+
+    if matches:
+        option_string = matches[-1]
+        if "'" not in option_string:
+            option_list = option_string.split(", ")
+        else:
+            option_list = [item.strip().strip("'") for item in option_string.split("', '")]
+        return option_list
+    return []
+
+
+def compare_and_count(array_a, array_b):
+    count = 0
+    for a, b in zip(array_a, array_b):
+        if a == 1 and b == 1:
+            count += 1
+        if a > b:
+            count += 1
+    return count
+
+
+def isfile(path):
+    return os.path.isfile(path)
+
+
+def load_json_data(path):
+    with open(path, 'r', encoding='utf-8') as json_f:
+        task_data = json.load(json_f)
+    return task_data
+
+
+def save_json_data(path, data):
+    with open(path, 'w', encoding='utf-8') as json_f:
+        json.dump(data, json_f, ensure_ascii=False, indent=4)
+
+
+def Geneal_criterion_QA(third_task_data, MODEL=None):
+    # Generic multiple-choice scoring: exact/containment match against the
+    # reference answer, plus an instruction-following check.
+    ques_total_num = 0
+    right_num = 0
+    obey_instruction = 0
+    for d_ind, sample in enumerate(third_task_data):
+        reference = sample['reference']
+        prediction = sample['prediction']
+        for q_ind, pred in enumerate(prediction):
+            ques_nopath = sample['questions'][q_ind].lower()
+            tips = extract_options(ques_nopath)
+            if len(tips) == 0:
+                pass
+            pred = remove_symbols(pred)
+            ques_total_num += 1
+            clean_pred = clean_string(pred).lower()
+            options_nums = clean_pred.split("', '")
+            reference_q_ind = convert_if_number(reference[q_ind]).lower()
+            if len(options_nums) == 1:
+                if clean_pred in ques_nopath:
+                    obey_instruction += 1
+                if clean_pred == reference_q_ind:
+                    right_num += 1
+                elif reference_q_ind in clean_pred:
+                    # filter out predictions that also contain other options
+                    if reference_q_ind in tips:
+                        tips.remove(reference_q_ind)
+                    if not any(tip in clean_pred for tip in tips):
+                        right_num += 1
+    return ques_total_num, right_num / ques_total_num, obey_instruction / ques_total_num, 0
+
+
+def Grounding_criterion_QA(third_task_data, MODEL=None):
+    # Grounding questions are scored by IoU (> 0.5 counts as correct); the other
+    # questions in the sample fall back to the generic QA criterion.
+    print('MODEL', MODEL)
+    if MODEL is None:
+        print('MODEL input missing')
+        return -1
+    resize_model_lists = ["qwen", "internvl", "gemini", "DriveMM", "ivl"]
+    ques_total_num = 0
+    right_num = 0
+    loc_union = []
+    obey_instruction = 0
+    PATTERN = re.compile(
+        r'\[\s*([^\],]*\d+[^\],]*)\s*,\s*([^\],]*\d+[^\],]*)\s*,'
+        r'\s*([^\],]*\d+[^\],]*)\s*,\s*([^\],]*\d+[^\],]*)\s*\]'
+    )
+    box_num = 0
+    for d_ind, sample in enumerate(third_task_data):
+        reference = sample['reference']
+        prediction = sample['prediction']
+        for q_ind, pred in enumerate(prediction):
+            ques_total_num += 1
+            ques_nopath = sample['questions'][q_ind].lower()
+            if 'located in the image?' in ques_nopath:
+                matches = PATTERN.findall(pred)
+                cleaned_matches = [[float(re.sub(r'[^0-9.]', '', part)) for part in match] for match in matches]
+                if len(matches) == 1:
+                    box_num += 1
+                    obey_instruction += 1
+                    predict_bbox = cleaned_matches[0]
+                else:
+                    predict_bbox = [0.0, 0.0, 0.0, 0.0]
+
+                if sum(predict_bbox) < 4:
+                    # boxes predicted in normalized [0, 1] coordinates are mapped to the 1000 scale
+                    predict_bbox = [x * 1000 for x in predict_bbox]
+
+                bbox_gt = list(map(int, misc.toliststr(sample['reference'][q_ind])))
+                if any(mn.lower() in MODEL.lower() for mn in resize_model_lists):
+                    # rescale the GT box to the 1000-scale frame these models answer in
+                    width, height = sample['dimension'][q_ind]
+                    width, height = float(width), float(height)
+                    bbox_gt = [int(1000 * bbox_gt[0] / width), int(1000 * bbox_gt[1] / height),
+                               int(1000 * bbox_gt[2] / width), int(1000 * bbox_gt[3] / height)]
+                elif MODEL == "gemini":
+                    bbox_gt = [bbox_gt[1], bbox_gt[0], bbox_gt[3], bbox_gt[2]]
+
+                iou = box_iou(predict_bbox, bbox_gt)
+                if iou > 0.5:
+                    right_num += 1
+                loc_union.append(iou)
+            else:
+                tips = extract_options(ques_nopath)
+                pred = remove_symbols(pred)
+                clean_pred = clean_string(pred).lower()
+                options_nums = clean_pred.split("', '")
+                reference_q_ind = convert_if_number(reference[q_ind]).lower()
+                if len(options_nums) == 1:
+                    if clean_pred in ques_nopath:
+                        obey_instruction += 1
+                    if clean_pred == reference_q_ind:
+                        right_num += 1
+                    elif reference_q_ind in clean_pred:
+                        # filter out predictions that also contain other options
+                        if reference_q_ind in tips:
+                            tips.remove(reference_q_ind)
+                        if not any(tip in clean_pred for tip in tips):
+                            right_num += 1
+
+    mean_iou = sum(loc_union) / len(loc_union) if loc_union else 0
+    return ques_total_num, right_num / ques_total_num, obey_instruction / ques_total_num, mean_iou
+
+
+def Relation_criterion_QA(third_task_data, MODEL=None):
+    ques_total_num = 0
+    total_score = 0
+    obey_insytruction = 0
+    totol_improve_score = 0
+    for d_ind, sample in enumerate(third_task_data):
+        reference = sample['reference']
+        prediction = sample['prediction']
+        scores_list = []
+        for q_ind, pred in enumerate(prediction):
+            ques_total_num += 1
+            if 'corresponds to' in pred:
+                # pattern = r'(? List[Dict[str, str]]:
+        """
+        Build a prompt for the model from a data line.
+
+        Args:
+            line: Either an index into the dataset or a pandas Series
+
+        Returns:
+            List of message dictionaries containing the image and question
+        """
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        tgt_path = misc.toliststr(line["image"])
+        question = line['question']
+        # form messages
+        msgs = []
+        if isinstance(tgt_path, list):
+            msgs.extend([dict(type='image', value=os.path.join(self.IMAGE_DIR, p)) for p in tgt_path])
+        else:
+            msgs = [dict(type='image', value=os.path.join(self.IMAGE_DIR, tgt_path))]
+        msgs.append(dict(type='text', value=question))
+
+        return msgs
+
+    def get_scores(self, result_file: str) -> pd.DataFrame:
+        data = file.load(result_file)
+        model_name = os.path.basename(result_file).split('_')[0]
+
+        all_results = []
+        total_results = []
+        for fir_ind, fir_task in enumerate(all_tasks):
+            sec_tasks = all_tasks[fir_task]
+            for sec_ind, sec_task in enumerate(sec_tasks):
+                if sec_task == 'Ego_trajectory_Planning':
+                    continue
+                third_tasks = sec_tasks[sec_task]
+                third_rows = 0
+                for third_ind, third_task in enumerate(third_tasks):
+                    # filter samples of the third-level task
+                    filter_data = data[data['category3'] == third_task]
+                    # prepare data structure for evaluation: list(dict(list))
+                    third_task_data = []
+                    same_vision_qas = {'reference': [], 'prediction': [], 'questions': [], 'dimension': []}
+                    dindex = 0
+                    for index, row in filter_data.iterrows():
+                        if dindex != row['dindex']:
+                            third_task_data.append(same_vision_qas)
+                            same_vision_qas = {'reference': [], 'prediction': [], 'questions': [], 'dimension': []}
+                        same_vision_qas['reference'].append(row['answer'])
+                        same_vision_qas['prediction'].append(row['prediction'])
+                        same_vision_qas['questions'].append(row['question'])
+                        same_vision_qas['dimension'].append(misc.toliststr(row['dimension']))
+                        dindex = row['dindex']
+                    third_task_data.append(same_vision_qas)
+
+                    # compute score
+                    third_rows += 1
+                    model_scores = [third_task]
+                    ques_total_num, right_num, obey_instruction, others = func_mapping[third_task](
+                        third_task_data, model_name)
+
+                    # weighted sum score
+                    if third_task in weights:
+                        weight = weights[third_task]
+                    else:
+                        weight = [0, 0.8, 0.2]
+                    temp_score = (100 * others * weight[0] + 100 * right_num * weight[1]
+                                  + 100 * obey_instruction * weight[2])
+
+                    model_scores.append(temp_score)
+                    model_scores.insert(1, ques_total_num)
+                    all_results.append(model_scores)
+                    total_results.append(model_scores)
+
+                # append a 'Mean' row over the last `third_rows` task rows
+                all_results = weighted_row_sum(all_results, third_rows)
+
+        total_ = weighted_total(total_results)
+        all_results.append(total_)
+        df = pd.DataFrame(all_results, columns=['Task', 'num', model_name])
+        return df
+
+    def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame:
+        """
+        Evaluate model predictions on the VLADBench dataset.
+
+        Args:
+            eval_file: Path to the file containing model predictions
+            **judge_kwargs: Additional arguments for the judge model
+
+        Returns:
+            DataFrame with evaluation scores by category
+        """
+        print(eval_file)
+        score = self.get_scores(eval_file)
+        score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
+        file.dump(score, score_file)
+        return score
\ No newline at end of file
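
Reviewer note: the snippet below is a minimal sanity-check sketch for the new scoring helpers, not part of the patch. It assumes the module added above is importable as vlmeval.dataset.utils.vladbench (run from the repo root with vlmeval installed); the toy rows and task names are made up for illustration.

# Check box_iou and the weighted aggregation that get_scores builds its table from.
from vlmeval.dataset.utils.vladbench import box_iou, weighted_row_sum, weighted_total

# IoU of two [x1, y1, x2, y2] boxes; grounding answers only count as correct when IoU > 0.5.
print(box_iou([0, 0, 10, 10], [5, 5, 15, 15]))   # ~0.17, so this pair would not count

# Per-task rows are [task_name, question_count, score]; weighted_row_sum appends a
# count-weighted 'Mean' row over the last `third_rows` rows, weighted_total a 'Total' row.
rows = [
    ['task_a', 10, 80.0],   # hypothetical third-level tasks
    ['task_b', 30, 60.0],
]
print(weighted_row_sum(rows, third_rows=2)[-1])  # 'Mean' row: count 40, score 65.0
print(weighted_total(rows))                      # 'Total' row: count 40, score 65.0

End to end, the dataset should then run through the usual entry point (python run.py --data <name registered for VLADBench> --model <model>), with the dataset name taken from whatever key the VLADBench class registers.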