
Commit 2232633

2 parents: b2b57b8 + b0324e0

9 files changed: 121 additions, 31 deletions

README.md

Lines changed: 47 additions & 1 deletion
@@ -62,8 +62,54 @@ pip install -e .
```

If you want to evaluate each modality of data, please use the following commands:
+<details>
+<summary>
+<b>text data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[text]
+pip install flash-attn==2.6.3
+python -m spacy download en_core_web_sm
+```
+
+</p>
+</details>
+
+<details>
+<summary>
+<b>image data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[image]
+pip install pyiqa==0.1.12
+pip install transformers==4.44.2
+```
+
+</p>
+</details>
+
+
+<details>
+<summary>
+<b>video data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[video]
+```
+When evaluating video-caption data, please run the following command to install the modified CLIP for EMScore:
+```
+pip install git+https://github.com/MOLYHECI/CLIP.git
+```
+
+</p>
+</details>

-All dependencies can be installed by:
<details>
<summary>
<b>All dependencies</b>

README.zh-CN.md

Lines changed: 48 additions & 6 deletions
@@ -56,18 +56,64 @@ DataFlow-Eval 是一个数据质量评估系统,可以从多个维度评估数

You can set up the conda environment with the following commands
```
-
conda create -n dataflow python=3.9

conda activate dataflow

pip install -e .
-
```


If you want to evaluate data of a single modality, you can use the installation commands below 👇

+<details>
+<summary>
+<b>text data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[text]
+pip install flash-attn==2.6.3
+python -m spacy download en_core_web_sm
+```
+
+</p>
+</details>
+
+<details>
+<summary>
+<b>image data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[image]
+pip install pyiqa==0.1.12
+pip install transformers==4.44.2
+```
+
+</p>
+</details>
+
+
+<details>
+<summary>
+<b>video data eval</b>
+</summary>
+<p>
+
+```bash
+pip install -e .[video]
+```
+When evaluating video-caption data, please run the following command to install the modified CLIP for EMScore:
+```
+pip install git+https://github.com/MOLYHECI/CLIP.git
+```
+
+</p>
+</details>
+
<details>
<summary>
<b>All dependencies</b>
@@ -84,14 +130,10 @@ pip install transformers==4.44.2
</p>
</details>

-
-
Please refer to the [data evaluation documentation](#数据评估文档) for the parameter usage rules. Data evaluation can be completed using only the yaml parameters:

```
-
python test.py --config [your config file]
-
```
<p align="center">
<img src="./static/images/example_1.png">

configs/text_scorer_pt.yaml

Lines changed: 5 additions & 5 deletions
@@ -5,10 +5,10 @@ dependencies: [text]

data:
  text:
-    use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below
-    dataset_name: 'yahma/alpaca-cleaned'
-    dataset_split: 'train'
-    name: 'default'
+    use_hf: False # Whether to use an online Huggingface dataset; if used, the local data path below is ignored
+    dataset_name: 'yahma/alpaca-cleaned' # Huggingface dataset: dataset name
+    dataset_split: 'train' # Huggingface dataset: dataset split
+    name: 'default' # Huggingface dataset: subset name

    data_path: 'demos/text_eval/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats
    formatter: "TextFormatter" # Data loader type
@@ -31,4 +31,4 @@ scorers: # You can select multiple text scorers from all_scorers.yaml and put th
    - educational_value
  PresidioScorer:
    language: 'en'
-    device: 'cuda:0'
+    device: 'cuda:0'
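
The new inline comments document how the Huggingface fields fit together. As a rough, hypothetical sketch (not the repository's actual TextFormatter code; the PyYAML import and the branching are assumptions), the `use_hf` switch and the fields above could be consumed like this:

```python
# Hypothetical sketch: how a config like the one above might be consumed.
# `use_hf` selects between the online Huggingface dataset and the local file;
# dataset_name / name / dataset_split map onto datasets.load_dataset arguments.
import json

import datasets  # Huggingface `datasets` library
import yaml      # PyYAML, assumed available

with open("configs/text_scorer_pt.yaml") as f:
    cfg = yaml.safe_load(f)["data"]["text"]

if cfg["use_hf"]:
    ds = datasets.load_dataset(
        cfg["dataset_name"],          # dataset name, e.g. 'yahma/alpaca-cleaned'
        name=cfg["name"],             # subset name, e.g. 'default'
        split=cfg["dataset_split"],   # dataset split, e.g. 'train'
    )
    samples = list(ds)
else:
    # Only the json case is shown; jsonl/parquet handling is omitted.
    with open(cfg["data_path"]) as f:
        samples = json.load(f)

print(f"Loaded {len(samples)} samples")
```

With `use_hf: False`, as in this config, only `data_path` matters and the three Huggingface fields are ignored.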

configs/text_scorer_sft.yaml

Lines changed: 5 additions & 5 deletions
@@ -5,10 +5,10 @@ dependencies: [text]

data:
  text:
-    use_hf: True # Whether to use huggingface_dataset, if used, ignore the local data path below
-    dataset_name: 'yahma/alpaca-cleaned'
-    dataset_split: 'train'
-    name: 'default'
+    use_hf: False # Whether to use an online Huggingface dataset; if used, the local data path below is ignored
+    dataset_name: 'yahma/alpaca-cleaned' # Huggingface dataset: dataset name
+    dataset_split: 'train' # Huggingface dataset: dataset split
+    name: 'default' # Huggingface dataset: subset name

    data_path: 'demos/text_eval/alpaca_5_samples.json' # Local data path, supports json, jsonl, parquet formats
    formatter: "TextFormatter" # Data loader type
@@ -19,4 +19,4 @@ scorers: # You can select multiple text scorers from all_scorers.yaml and put th
  DeitaQualityScorer:
    device: 'cuda:0'
    model_name: 'hkust-nlp/deita-quality-scorer'
-    max_length: 512
+    max_length: 512

dataflow/Eval/Text/README.md

Lines changed: 5 additions & 5 deletions
@@ -12,10 +12,10 @@ model_cache_path: '../ckpt' # cache path for models

data:
  text:
-    use_hf: False # Whether to use huggingface_dataset, if used, ignore the local data path below
-    dataset_name: 'yahma/alpaca-cleaned'
-    dataset_split: 'train'
-    name: 'default'
+    use_hf: False # Whether to use an online Huggingface dataset; if used, the local data path below is ignored
+    dataset_name: 'yahma/alpaca-cleaned' # Huggingface dataset: dataset name
+    dataset_split: 'train' # Huggingface dataset: dataset split
+    name: 'default' # Huggingface dataset: subset name

    data_path: 'demos/text_eval/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats
    formatter: "TextFormatter" # Data loader type
@@ -152,4 +152,4 @@ calculate_score(save_path='./scores.json')
}
}
}
-```
+```

dataflow/Eval/Text/README.zh-CN.md

Lines changed: 5 additions & 5 deletions
@@ -13,10 +13,10 @@ model_cache_path: '../ckpt' # 模型默认缓存路径

data:
  text:
-    use_hf: False # Whether to use huggingface_dataset; if used, the local data path below is ignored
-    dataset_name: 'yahma/alpaca-cleaned'
-    dataset_split: 'train'
-    name: 'default'
+    use_hf: False # Whether to use an online Huggingface dataset; if used, the local data path below is ignored
+    dataset_name: 'yahma/alpaca-cleaned' # Huggingface dataset: dataset name
+    dataset_split: 'train' # Huggingface dataset: dataset split
+    name: 'default' # Huggingface dataset: subset name

    data_path: 'demos/text_eval/fineweb_5_samples.json' # Local data path, supports json, jsonl, parquet formats
    formatter: "TextFormatter" # Data loader type
@@ -155,4 +155,4 @@ calculate_score(save_path='./scores.json')
}
}
}
-```
+```

dataflow/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
from .config import *
from .utils import *
-# from .Eval import *
+from .Eval import *
from .format import *

-from .utils.utils import list_image_eval_metrics, get_scorer
+from .utils.utils import list_image_eval_metrics, get_scorer
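
Un-commenting `from .Eval import *` means that importing the top-level package now also imports `dataflow.Eval`, so whatever that module defines or registers becomes reachable right after `import dataflow`. A minimal, hedged way to observe the effect (nothing here relies on dataflow-specific API beyond the plain import):

```python
# Minimal check: after this commit, importing the package also imports
# dataflow.Eval, so its public names (and any registry side effects) are
# available as soon as the top-level package is imported.
import dataflow

# Inspect what the top-level namespace exposes; the exact names depend on
# what dataflow.Eval defines, which is not shown in this diff.
public_names = [name for name in dir(dataflow) if not name.startswith("_")]
print(public_names)
```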

dataflow/format/text_formatter.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-from datasets import load_dataset
+import datasets
import json
import pyarrow.parquet as pq
from dataflow.utils.registry import FORMATTER_REGISTRY
@@ -38,7 +38,7 @@ def load_hf_dataset(self, dataset_name, dataset_split=None, name=None, keys=None
            "name": name
        }

-        dataset = load_dataset(**{k: v for k, v in load_kwargs.items() if v is not None})
+        dataset = datasets.load_dataset(**{k: v for k, v in load_kwargs.items() if v is not None})

        metadata = {
            "description": dataset.info.description if hasattr(dataset, "info") else None,

dataflow/utils/utils.py

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,8 @@ def recursive_len(scores: dict):
            return recursive_len(v)
        elif isinstance(v, np.ndarray):
            return v.shape[0]
+        elif isinstance(v, list):
+            return len(v)
        else:
            raise ValueError(f"Invalid scores type {type(v)} returned")
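
The two added lines let `recursive_len` count plain Python lists in addition to numpy arrays. A reduced, runnable sketch follows; only the `isinstance` branches are taken from the hunk, and the dict-walking loop around them is an assumption:

```python
import numpy as np


def recursive_len(scores: dict):
    # Assumed wrapper: walk the (possibly nested) scores dict and report the
    # length of the first array-like value found.
    for v in scores.values():
        if isinstance(v, dict):
            return recursive_len(v)
        elif isinstance(v, np.ndarray):
            return v.shape[0]
        elif isinstance(v, list):   # new branch added by this commit
            return len(v)
        else:
            raise ValueError(f"Invalid scores type {type(v)} returned")


# Scorers that return plain Python lists no longer raise:
print(recursive_len({"SomeScorer": {"Default": [0.1, 0.8, 0.3]}}))  # -> 3
```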
