import signal
import tempfile
import weakref

from dataflow import get_logger
import pandas as pd
import json
    def get_keys_from_dataframe(self) -> list[str]:
        """
        Returns column names from the dataframe after reading from database.
        """
        dataframe = self.read(output_type="dataframe")
        return dataframe.columns.tolist() if isinstance(dataframe, pd.DataFrame) else []


class BatchedFileStorage(FileStorage):
    """
    Batched file storage that supports reading and writing data in batches.
    """
    def __init__(
        self,
        first_entry_file_name: str,
        cache_path: str = "./cache",
        file_name_prefix: str = "dataflow_cache_step",
        cache_type: Literal["jsonl", "csv"] = "jsonl",
        batch_size: int = 10000,
    ):
        if cache_type not in ["jsonl", "csv"]:
            raise ValueError(f"BatchedFileStorage only supports 'jsonl' and 'csv' cache types, got: {cache_type}")
        super().__init__(first_entry_file_name, cache_path, file_name_prefix, cache_type)
        self.batch_size = batch_size
        # batch_step selects which slice of the cache file read()/write() operate on;
        # it starts at 0 and is expected to be advanced by the caller between batches.
        self.batch_step = 0
    def read(self, output_type: Literal["dataframe", "dict"] = "dataframe") -> Any:
        """
        Read data from the current file managed by the storage. Local files are
        returned one batch (batch_size rows) at a time; remote datasets are
        returned whole.

        Args:
            output_type: Type to read into, either "dataframe" or "dict".

        On the first step, first_entry_file_name may also point to a remote
        dataset via a prefix:
            - "hf:{dataset_name}{:config}{:split}" => HuggingFace dataset, e.g. "hf:openai/gsm8k:main:train"
            - "ms:{dataset_name}{:split}" => ModelScope dataset, e.g. "ms:modelscope/gsm8k:train"

        Returns:
            Depending on output_type:
                - "dataframe": pandas DataFrame
                - "dict": list of dictionaries

        Raises:
            ValueError: For unsupported file types or output types
        """
        if self.operator_step == 0 and self.first_entry_file_name == "":
            self.logger.info("first_entry_file_name is empty, returning empty dataframe")
            empty_dataframe = pd.DataFrame()
            return self._convert_output(empty_dataframe, output_type)

        file_path = self._get_cache_file_path(self.operator_step)
        self.logger.info(f"Reading data from {file_path} with type {output_type}")

        if self.operator_step == 0:
            source = self.first_entry_file_name
            self.logger.info(f"Reading remote dataset from {source} with type {output_type}")
            if source.startswith("hf:"):
                from datasets import load_dataset
                _, dataset_name, *parts = source.split(":")

                if len(parts) == 1:
                    config, split = None, parts[0]
                elif len(parts) == 2:
                    config, split = parts
                else:
                    config, split = None, "train"
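                # Example parse, per the docstring: "hf:openai/gsm8k:main:train"
                # yields dataset_name="openai/gsm8k", config="main", split="train";
                # "hf:openai/gsm8k:train" yields config=None, split="train".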

                dataset = (
                    load_dataset(dataset_name, config, split=split)
                    if config
                    else load_dataset(dataset_name, split=split)
                )
                dataframe = dataset.to_pandas()
                return self._convert_output(dataframe, output_type)

            elif source.startswith("ms:"):
                from modelscope import MsDataset
                _, dataset_name, *split_parts = source.split(":")
                split = split_parts[0] if split_parts else "train"
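                # Example parse, per the docstring: "ms:modelscope/gsm8k:train"
                # yields dataset_name="modelscope/gsm8k", split="train".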

                dataset = MsDataset.load(dataset_name, split=split)
                dataframe = pd.DataFrame(dataset)
                return self._convert_output(dataframe, output_type)

            else:
                # Local first-entry file: infer the on-disk format from its extension.
                local_cache = file_path.split(".")[-1]
        else:
            local_cache = self.cache_type
        # TODO: Loading the whole file below may be a bottleneck for large files; consider optimizing later.
        dataframe = self._load_local_file(file_path, local_cache)
        self.record_count = len(dataframe)
        # Slice out the current batch.
        dataframe = dataframe.iloc[
            self.batch_step * self.batch_size : (self.batch_step + 1) * self.batch_size
        ]
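        # e.g. with batch_size=10000 and batch_step=2 this selects rows 20000-29999;
        # slicing past the end of the frame yields an empty batch rather than raising.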
        return self._convert_output(dataframe, output_type)

    def write(self, data: Any) -> Any:
        """
        Write one batch of data to the current file managed by the storage.

        Args:
            data: The data to write; a pandas DataFrame or a list of dicts.
        """
        def clean_surrogates(obj):
            """Recursively remove invalid Unicode surrogate characters from the data."""
            if isinstance(obj, str):
                # Replace invalid surrogate code points (e.g. \udc00) that cannot be encoded as UTF-8.
                return obj.encode('utf-8', 'replace').decode('utf-8')
            elif isinstance(obj, dict):
                return {k: clean_surrogates(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [clean_surrogates(item) for item in obj]
            elif isinstance(obj, (int, float, bool)) or obj is None:
                # Numbers, booleans and None pass through unchanged.
                return obj
            else:
                # Other types (e.g. custom objects): try converting to a string first.
                try:
                    return clean_surrogates(str(obj))
                except Exception:
                    # If conversion fails, return the object unchanged.
                    return obj
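        # Hypothetical example: clean_surrogates({"text": "bad\udc80byte"}) returns
        # {"text": "bad?byte"}, since the lone surrogate cannot be UTF-8 encoded and
        # the 'replace' error handler substitutes "?".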

        # Normalize the input into a DataFrame.
        if isinstance(data, list):
            if len(data) == 0:
                # An empty list writes an empty batch (avoids indexing data[0] below).
                dataframe = pd.DataFrame()
            elif isinstance(data[0], dict):
                # Clean every dict in the list.
                cleaned_data = [clean_surrogates(item) for item in data]
                dataframe = pd.DataFrame(cleaned_data)
            else:
                raise ValueError(f"Unsupported data type: {type(data[0])}")
        elif isinstance(data, pd.DataFrame):
            # Clean every element of the DataFrame (DataFrame.map applies element-wise).
            dataframe = data.map(clean_surrogates)
        else:
            raise ValueError(f"Unsupported data type: {type(data)}")

        file_path = self._get_cache_file_path(self.operator_step + 1)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        self.logger.success(f"Writing data to {file_path} with type {self.cache_type}")
        if self.cache_type == "jsonl":
            # Append so that successive batches accumulate in a single file.
            with open(file_path, 'a', encoding='utf-8') as f:
                dataframe.to_json(f, orient="records", lines=True, force_ascii=False)
        elif self.cache_type == "csv":
            if self.batch_step == 0:
                # First batch: write the file, including the header row.
                dataframe.to_csv(file_path, index=False)
            else:
                # Later batches: append without repeating the header.
                dataframe.to_csv(file_path, index=False, header=False, mode='a')
        else:
            raise ValueError(f"Unsupported file type: {self.cache_type}, output file should end with .jsonl or .csv")

        return file_path
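
# A minimal driver sketch (hypothetical file names; assumes the parent FileStorage
# supplies operator_step, _get_cache_file_path and _convert_output, and that the
# pipeline advances batch_step between batches):
#
#     storage = BatchedFileStorage(
#         first_entry_file_name="./data/input.jsonl",
#         cache_path="./cache",
#         cache_type="jsonl",
#         batch_size=10000,
#     )
#     while True:
#         batch = storage.read(output_type="dict")
#         if not batch:
#             break
#         storage.write(batch)
#         storage.batch_step += 1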