-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit-csv.py
More file actions
98 lines (73 loc) · 3.09 KB
/
split-csv.py
File metadata and controls
98 lines (73 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import os
from pathlib import Path
import glob
def get_file_size_mb(filepath):
return os.path.getsize(filepath) / (1024 * 1024)
def sanitize_name(name):
invalid = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
for char in invalid:
name = name.replace(char, '_')
return name.strip()
def split_sheet(df, sheet_name, base_name, output_dir, max_size_mb):
if len(df) == 0:
return 0
safe_sheet = sanitize_name(sheet_name)
safe_base = sanitize_name(base_name)
# Estimate rows per segment
sample = df.head(min(100, len(df)))
temp_file = os.path.join(output_dir, "temp_test.xlsx")
sample.to_excel(temp_file, index=False, engine='openpyxl')
sample_size = get_file_size_mb(temp_file)
os.remove(temp_file)
rows_per_mb = len(sample) / sample_size if sample_size > 0 else 100
chunk_size = max(10, int(rows_per_mb * max_size_mb * 0.85))
segment = 1
start = 0
created = 0
while start < len(df):
end = min(start + chunk_size, len(df))
chunk = df.iloc[start:end]
filename = f"{safe_base}_{safe_sheet}_seg{segment}.xlsx"
filepath = os.path.join(output_dir, filename)
chunk.to_excel(filepath, index=False, engine='openpyxl')
size = get_file_size_mb(filepath)
print(f" ✓ {filename} ({size:.2f} MB, {len(chunk)} rows)")
start = end
segment += 1
created += 1
return created
def process_all_excel_files(max_size_mb=4):
current_dir = os.getcwd()
excel_files = glob.glob("*.xlsx") + glob.glob("*.xls")
if not excel_files:
print("❌ No Excel files found in current directory!")
return
print(f"\n{'='*60}")
print(f"Found {len(excel_files)} Excel file(s) in: {current_dir}")
print(f"Target segment size: {max_size_mb} MB")
print(f"{'='*60}\n")
for excel_file in excel_files:
print(f"\n📁 Processing: {excel_file}")
print(f" Original size: {get_file_size_mb(excel_file):.2f} MB")
try:
xls = pd.ExcelFile(excel_file, engine='openpyxl')
base_name = Path(excel_file).stem
total_segments = 0
for sheet_name in xls.sheet_names:
df = pd.read_excel(excel_file, sheet_name=sheet_name, engine='openpyxl')
if len(df) > 0:
print(f"\n 📋 Sheet: '{sheet_name}' ({len(df)} rows)")
segs = split_sheet(df, sheet_name, base_name, current_dir, max_size_mb)
total_segments += segs
print(f"\n ✅ Done! Created {total_segments} segments from {excel_file}\n")
print("-" * 60)
except Exception as e:
print(f" ❌ Error: {str(e)}\n")
continue
print(f"\n{'='*60}")
print("🎉 ALL FILES PROCESSED!")
print(f"{'='*60}\n")
if __name__ == "__main__":
# Just run it - processes all Excel files in current directory
process_all_excel_files(max_size_mb=4)