Skip to content

Commit 1290c34

Browse files
committed
feat: add satellite data pipeline and automatic file cleanup mechanism
1 parent f5cbb7d commit 1290c34

File tree

8 files changed

+446
-122
lines changed

8 files changed

+446
-122
lines changed

.github/workflows/cleanup.yml

Lines changed: 0 additions & 103 deletions
This file was deleted.

.github/workflows/pytest.yml

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,19 +31,4 @@ jobs:
3131
run: |
3232
python -m pip install --upgrade pip
3333
pip install -e .
34-
pip install -e ".[test]"
35-
36-
- name: Run tests with coverage
37-
run: |
38-
pytest tests/ -m "not requires_data" \
39-
--cov=src \
40-
--cov-report=term-missing \
41-
--cov-report=xml \
42-
-v
43-
44-
- name: Upload coverage reports
45-
uses: actions/upload-artifact@v4
46-
with:
47-
name: coverage-report-${{ matrix.python-version }}-${{ github.sha }}
48-
path: coverage.xml
49-
if-no-files-found: error
34+
pip install -e ".[test]"

app_run.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# 導入你的處理模組
88
from src.api.sentinel_api import S5PFetcher
9-
from src.config import setup_directory_structure
9+
# from src.config import setup_directory_structure
1010

1111

1212
class SatelliteApp:
@@ -136,7 +136,7 @@ def process_data(self, selected_data):
136136
data_mode = self.data_mode.get()
137137

138138
self.log_message(f"開始處理數據:{start_str}{end_str}")
139-
setup_directory_structure(start_str, end_str)
139+
# setup_directory_structure(start_str, end_str)
140140

141141
fetcher = S5PFetcher(max_workers=3)
142142

file_retention_manager.py

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
"""
2+
檔案保留期限管理系統
3+
用於自動清理超過保留期限的衛星數據檔案
4+
支持嵌套目錄結構: Satellite/figure/file_type/年份/月份/檔案
5+
"""
6+
import logging
7+
from pathlib import Path
8+
from datetime import datetime, timedelta
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
class FileRetentionManager:
    """Delete satellite data files that are older than a retention window.

    A file is considered expired when its modification time (or, for marker
    files, the date embedded in its name) is earlier than
    ``now - retention_days``. Individual failures are logged and skipped so
    that one bad file never aborts a whole cleanup run.
    """

    # Marker files are named ``processed_YYYY-MM-DD.flag``.
    _FLAG_PREFIX = "processed_"
    _FLAG_SUFFIX = ".flag"
    _FLAG_DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, retention_days):
        """Create a manager.

        Args:
            retention_days (int): Number of days a file is kept before it
                becomes eligible for deletion.
        """
        self.retention_days = retention_days

    def _cutoff_date(self):
        """Return the datetime before which files count as expired."""
        return datetime.now() - timedelta(days=self.retention_days)

    @staticmethod
    def _remove_if_expired(file_path, cutoff_date, description):
        """Delete ``file_path`` if its mtime is earlier than ``cutoff_date``.

        Args:
            file_path (Path): File to examine.
            cutoff_date (datetime): Files modified before this are deleted.
            description (str): Text inserted into the success log message.

        Returns:
            bool: True when the file was deleted, False otherwise.
        """
        try:
            # stat() lives inside the try on purpose: a file that disappears
            # or becomes unreadable between listing and inspection must not
            # abort the cleanup of the remaining files.
            file_mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
            if file_mtime >= cutoff_date:
                return False
            file_path.unlink()
        except (OSError, OverflowError, ValueError) as e:
            logger.error(f"刪除檔案 {file_path} 時出錯: {str(e)}")
            return False

        logger.info(f"已刪除{description}: {file_path}")
        return True

    @staticmethod
    def _remove_dir_if_empty(directory, description):
        """Remove ``directory`` when it has no entries left; log failures.

        Args:
            directory (Path): Directory to examine.
            description (str): Text inserted into the success log message.
        """
        try:
            if any(directory.iterdir()):
                return
            directory.rmdir()
        except OSError as e:
            logger.error(f"刪除目錄 {directory} 時出錯: {str(e)}")
            return

        logger.info(f"已刪除{description}: {directory}")

    def clean_directories(self, base_dir, subdirs=None):
        """Delete expired files located directly inside the given directories.

        Only regular files that are immediate children of each directory are
        considered; nested directories are not descended into.

        Args:
            base_dir (str or Path): Base directory.
            subdirs (list): Names of sub-directories of ``base_dir`` to clean.
                When None or empty, ``base_dir`` itself is cleaned.
                Sub-directories that do not exist are silently skipped.

        Returns:
            int: Number of files deleted.
        """
        base_path = Path(base_dir)

        if not base_path.exists():
            logger.warning(f"目錄不存在: {base_path}")
            return 0

        if subdirs:
            candidates = (base_path / subdir for subdir in subdirs)
            dirs_to_clean = [path for path in candidates if path.exists()]
        else:
            dirs_to_clean = [base_path]

        cutoff_date = self._cutoff_date()
        return sum(
            self._clean_directory(directory, cutoff_date)
            for directory in dirs_to_clean
        )

    def _clean_directory(self, directory, cutoff_date):
        """Delete expired regular files that are direct children of a directory.

        Args:
            directory (Path): Directory to clean.
            cutoff_date (datetime): Files modified before this are deleted.

        Returns:
            int: Number of files deleted.
        """
        logger.info(f"開始清理目錄: {directory}")

        # Snapshot the listing first so deletions do not disturb iteration.
        files = [entry for entry in directory.iterdir() if entry.is_file()]
        return sum(
            self._remove_if_expired(file_path, cutoff_date, "舊檔案")
            for file_path in files
        )

    def clean_satellite_figure_data(self, data_root, file_types=None):
        """Delete expired PNG figures from ``figure/<type>/<year>/<month>/``.

        After the PNG files of a month directory are processed, the month
        directory is removed if it is empty, and likewise the year directory.

        Args:
            data_root (str or Path): Data root directory; figures are looked
                up under ``<data_root>/figure``.
            file_types (list): Product directory names such as
                ``['NO2____', 'CO_____']``. When None, every sub-directory of
                the figure directory is processed.

        Returns:
            dict: Maps each file type to the number of PNG files deleted
            (0 for types whose directory does not exist). Empty when the
            figure directory itself is missing.
        """
        # NOTE(review): the sibling paths in clean_all_satellite_data use
        # <data_root>/Satellite/..., while this one uses <data_root>/figure.
        # Left unchanged because the real layout is not visible here; confirm
        # against the directory configuration.
        figure_path = Path(data_root) / "figure"

        if not figure_path.exists():
            logger.warning(f"衛星圖像目錄不存在: {figure_path}")
            return {}

        if file_types is None:
            file_types = [d.name for d in figure_path.iterdir() if d.is_dir()]

        cutoff_date = self._cutoff_date()
        results = {}

        for file_type in file_types:
            file_type_dir = figure_path / file_type
            if not file_type_dir.exists():
                logger.warning(f"文件類型目錄不存在: {file_type_dir}")
                results[file_type] = 0
                continue

            removed_count = 0
            # Listings are materialised before any deletion happens below.
            year_dirs = [d for d in file_type_dir.iterdir() if d.is_dir()]
            for year_dir in year_dirs:
                month_dirs = [d for d in year_dir.iterdir() if d.is_dir()]
                for month_dir in month_dirs:
                    for png_file in list(month_dir.glob("*.png")):
                        if self._remove_if_expired(
                            png_file, cutoff_date, "舊圖像檔案"
                        ):
                            removed_count += 1

                    self._remove_dir_if_empty(month_dir, "空月份目錄")

                self._remove_dir_if_empty(year_dir, "空年份目錄")

            results[file_type] = removed_count

        return results

    def clean_all_satellite_data(self, data_root, file_types=None):
        """Run every cleanup step for the satellite data tree.

        Steps: figure PNGs, files directly inside ``Satellite/data``, files
        directly inside ``Satellite/processed``, and ``processed_*.flag``
        marker files directly inside ``Satellite``.

        Args:
            data_root (str or Path): Data root directory.
            file_types (list): Product directory names such as
                ``['NO2____', 'CO_____']``; forwarded to
                ``clean_satellite_figure_data``.

        Returns:
            dict: Deleted-file counts keyed by ``figure_<type>``,
            ``data_files``, ``processed_files`` and ``flag_files``. A key is
            present only when the corresponding directory exists.
        """
        data_root_path = Path(data_root)
        satellite_dir = data_root_path / "Satellite"

        figure_results = self.clean_satellite_figure_data(
            data_root_path, file_types
        )
        results = {
            f'figure_{file_type}': count
            for file_type, count in figure_results.items()
        }

        # Downloaded raw data files, if that directory exists.
        data_dir = satellite_dir / "data"
        if data_dir.exists():
            results['data_files'] = self.clean_directories(data_dir)

        # Post-processed data files, if that directory exists.
        processed_dir = satellite_dir / "processed"
        if processed_dir.exists():
            results['processed_files'] = self.clean_directories(processed_dir)

        # Marker files (processed_*.flag).
        if satellite_dir.exists():
            results['flag_files'] = self._clean_flag_files(
                satellite_dir, cutoff_date=self._cutoff_date()
            )

        return results

    def _clean_flag_files(self, directory, cutoff_date):
        """Delete ``processed_YYYY-MM-DD.flag`` files dated before the cutoff.

        The date is taken from the file name, not from the file's mtime.
        Files whose name does not contain a valid date are logged and kept.

        Args:
            directory (Path): Directory holding the marker files.
            cutoff_date (datetime): Markers dated before this are deleted.

        Returns:
            int: Number of marker files deleted.
        """
        directory_path = Path(directory)
        flag_pattern = f"{self._FLAG_PREFIX}*{self._FLAG_SUFFIX}"
        prefix_len = len(self._FLAG_PREFIX)
        suffix_len = len(self._FLAG_SUFFIX)

        removed_count = 0

        for flag_file in list(directory_path.glob(flag_pattern)):
            # Slice off exactly the affixes matched by the glob; str.replace
            # would also strip occurrences in the middle of the name.
            date_str = flag_file.name[prefix_len:-suffix_len]
            try:
                file_date = datetime.strptime(date_str, self._FLAG_DATE_FORMAT)
            except ValueError as e:
                logger.error(f"處理標記檔案 {flag_file} 時出錯: {str(e)}")
                continue

            if file_date >= cutoff_date:
                continue

            try:
                flag_file.unlink()
            except OSError as e:
                logger.error(f"處理標記檔案 {flag_file} 時出錯: {str(e)}")
                continue

            logger.info(f"已刪除舊標記檔案: {flag_file}")
            removed_count += 1

        return removed_count

main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def main():
8989
# 步驟:
9090
# 1. 前往src.config.settings中更改輸出路徑(硬碟路徑)
9191
# 2. 設定參數
92-
start, end = '2022-01-01', '2022-01-31'
92+
start, end = '2022-01-01', '2024-12-31'
9393
file_class: ClassInput = 'OFFL'
9494
file_type: TypeInput = 'NO2___'
9595

0 commit comments

Comments
 (0)