import signal
import tempfile
import weakref

from dataflow import get_logger
import pandas as pd
import json
    def get_keys_from_dataframe(self) -> list[str]:
        """
        Returns column names from the dataframe after reading from database.
        """
        dataframe = self.read(output_type="dataframe")
        return dataframe.columns.tolist() if isinstance(dataframe, pd.DataFrame) else []


class BatchedFileStorage(FileStorage):
    """
    Batched file storage that supports reading and writing data in batches.
    """
    def __init__(
        self,
        first_entry_file_name: str,
        cache_path: str = "./cache",
        file_name_prefix: str = "dataflow_cache_step",
        cache_type: Literal["jsonl", "csv"] = "jsonl",
        batch_size: int = 10000,
    ):
        if cache_type not in ["jsonl", "csv"]:
            raise ValueError(f"BatchedFileStorage only supports 'jsonl' and 'csv' cache types, got: {cache_type}")
        super().__init__(first_entry_file_name, cache_path, file_name_prefix, cache_type)
        self.batch_size = batch_size
        # batch_step selects which slice of the cache file read()/write() operate on;
        # it starts at 0 and is expected to be advanced by the caller between batches.
        self.batch_step = 0
    def read(self, output_type: Literal["dataframe", "dict"] = "dataframe") -> Any:
        """
        Read data from the current file managed by the storage. Local files are
        returned one batch (batch_size rows) at a time; remote datasets are
        returned whole.

        Args:
            output_type: Type to read into, either "dataframe" or "dict".

        On the first step, first_entry_file_name may also point to a remote
        dataset via a prefix:
            - "hf:{dataset_name}{:config}{:split}" => HuggingFace dataset, e.g. "hf:openai/gsm8k:main:train"
            - "ms:{dataset_name}{:split}" => ModelScope dataset, e.g. "ms:modelscope/gsm8k:train"

        Returns:
            Depending on output_type:
                - "dataframe": pandas DataFrame
                - "dict": list of dictionaries

        Raises:
            ValueError: For unsupported file types or output types
        """
        if self.operator_step == 0 and self.first_entry_file_name == "":
            self.logger.info("first_entry_file_name is empty, returning empty dataframe")
            empty_dataframe = pd.DataFrame()
            return self._convert_output(empty_dataframe, output_type)

        file_path = self._get_cache_file_path(self.operator_step)
        self.logger.info(f"Reading data from {file_path} with type {output_type}")

        if self.operator_step == 0:
            source = self.first_entry_file_name
            self.logger.info(f"Reading remote dataset from {source} with type {output_type}")
            if source.startswith("hf:"):
                from datasets import load_dataset
                _, dataset_name, *parts = source.split(":")

                if len(parts) == 1:
                    config, split = None, parts[0]
                elif len(parts) == 2:
                    config, split = parts
                else:
                    config, split = None, "train"
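                # Example parse, per the docstring: "hf:openai/gsm8k:main:train"
                # yields dataset_name="openai/gsm8k", config="main", split="train";
                # "hf:openai/gsm8k:train" yields config=None, split="train".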

                dataset = (
                    load_dataset(dataset_name, config, split=split)
                    if config
                    else load_dataset(dataset_name, split=split)
                )
                dataframe = dataset.to_pandas()
                return self._convert_output(dataframe, output_type)

            elif source.startswith("ms:"):
                from modelscope import MsDataset
                _, dataset_name, *split_parts = source.split(":")
                split = split_parts[0] if split_parts else "train"
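                # Example parse, per the docstring: "ms:modelscope/gsm8k:train"
                # yields dataset_name="modelscope/gsm8k", split="train".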

                dataset = MsDataset.load(dataset_name, split=split)
                dataframe = pd.DataFrame(dataset)
                return self._convert_output(dataframe, output_type)

            else:
                # Local first-entry file: infer the on-disk format from its extension.
                local_cache = file_path.split(".")[-1]
        else:
            local_cache = self.cache_type
        # TODO: Loading the whole file below may be a bottleneck for large files; consider optimizing later.
        dataframe = self._load_local_file(file_path, local_cache)
        self.record_count = len(dataframe)
        # Slice out the current batch.
        dataframe = dataframe.iloc[
            self.batch_step * self.batch_size : (self.batch_step + 1) * self.batch_size
        ]
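        # e.g. with batch_size=10000 and batch_step=2 this selects rows 20000-29999;
        # slicing past the end of the frame yields an empty batch rather than raising.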
        return self._convert_output(dataframe, output_type)

    def write(self, data: Any) -> Any:
        """
        Write one batch of data to the current file managed by the storage.

        Args:
            data: The data to write; a pandas DataFrame or a list of dicts.
        """
        def clean_surrogates(obj):
            """Recursively remove invalid Unicode surrogate characters from the data."""
            if isinstance(obj, str):
                # Replace invalid surrogate code points (e.g. \udc00) that cannot be encoded as UTF-8.
                return obj.encode('utf-8', 'replace').decode('utf-8')
            elif isinstance(obj, dict):
                return {k: clean_surrogates(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [clean_surrogates(item) for item in obj]
            elif isinstance(obj, (int, float, bool)) or obj is None:
                # Numbers, booleans and None pass through unchanged.
                return obj
            else:
                # Other types (e.g. custom objects): try converting to a string first.
                try:
                    return clean_surrogates(str(obj))
                except Exception:
                    # If conversion fails, return the object unchanged.
                    return obj
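        # Hypothetical example: clean_surrogates({"text": "bad\udc80byte"}) returns
        # {"text": "bad?byte"}, since the lone surrogate cannot be UTF-8 encoded and
        # the 'replace' error handler substitutes "?".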

        # Normalize the input into a DataFrame.
        if isinstance(data, list):
            if len(data) == 0:
                # An empty list writes an empty batch (avoids indexing data[0] below).
                dataframe = pd.DataFrame()
            elif isinstance(data[0], dict):
                # Clean every dict in the list.
                cleaned_data = [clean_surrogates(item) for item in data]
                dataframe = pd.DataFrame(cleaned_data)
            else:
                raise ValueError(f"Unsupported data type: {type(data[0])}")
        elif isinstance(data, pd.DataFrame):
            # Clean every element of the DataFrame (DataFrame.map applies element-wise).
            dataframe = data.map(clean_surrogates)
        else:
            raise ValueError(f"Unsupported data type: {type(data)}")

        file_path = self._get_cache_file_path(self.operator_step + 1)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        self.logger.success(f"Writing data to {file_path} with type {self.cache_type}")
        if self.cache_type == "jsonl":
            # Append so that successive batches accumulate in a single file.
            with open(file_path, 'a', encoding='utf-8') as f:
                dataframe.to_json(f, orient="records", lines=True, force_ascii=False)
        elif self.cache_type == "csv":
            if self.batch_step == 0:
                # First batch: write the file, including the header row.
                dataframe.to_csv(file_path, index=False)
            else:
                # Later batches: append without repeating the header.
                dataframe.to_csv(file_path, index=False, header=False, mode='a')
        else:
            raise ValueError(f"Unsupported file type: {self.cache_type}, output file should end with .jsonl or .csv")

        return file_path
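
# A minimal driver sketch (hypothetical file names; assumes the parent FileStorage
# supplies operator_step, _get_cache_file_path and _convert_output, and that the
# pipeline advances batch_step between batches):
#
#     storage = BatchedFileStorage(
#         first_entry_file_name="./data/input.jsonl",
#         cache_path="./cache",
#         cache_type="jsonl",
#         batch_size=10000,
#     )
#     while True:
#         batch = storage.read(output_type="dict")
#         if not batch:
#             break
#         storage.write(batch)
#         storage.batch_step += 1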