2020import urllib .parse
2121import urllib .request
2222import urllib .error
23- from concurrent .futures import ThreadPoolExecutor , as_completed , wait , FIRST_COMPLETED
23+ from concurrent .futures import ThreadPoolExecutor , as_completed
2424from dataclasses import dataclass
2525from pathlib import Path
2626from typing import Literal , Optional
@@ -91,89 +91,126 @@ def http_get(url: str, headers: dict[str, str] | None = None) -> bytes:
9191
9292
9393def fetch_github (spec : RepoSpec , output : Path , token : Optional [str ] = None ) -> None :
94- """Download from GitHub using Contents API."""
94+ """Download from GitHub using Tree API (recursive) ."""
9595 token = token or os .getenv ("GITHUB_TOKEN" , "" )
9696 headers = {"Accept" : "application/vnd.github.v3+json" }
9797 if token :
9898 headers ["Authorization" ] = f"token { token } "
9999
100- files_to_download = []
101-
102- def process_node (current_spec : RepoSpec , current_output : Path ):
103- api_url = f"https://api.github.com/repos/{ current_spec .owner } /{ current_spec .repo } /contents/{ current_spec .path } "
104- if current_spec .branch != "main" :
105- api_url += f"?ref={ current_spec .branch } "
100+ # Fetch the entire tree recursively
101+ api_url = (
102+ f"https://api.github.com/repos/{ spec .owner } /{ spec .repo } /git/trees/"
103+ f"{ urllib .parse .quote (spec .branch , safe = '' )} ?recursive=1"
104+ )
106105
107- try :
108- data_bytes = http_get (api_url , headers )
109- data = json .loads (data_bytes )
110- except urllib .error .HTTPError as e :
111- if e .code == 404 :
112- # Fallback to raw file download if API fails (maybe it's a file, not dir)
113- raw_url = f"https://raw.githubusercontent.com/{ current_spec .owner } /{ current_spec .repo } /{ current_spec .branch } /{ current_spec .path } "
114- content = http_get (raw_url , headers )
115- current_output .parent .mkdir (parents = True , exist_ok = True )
116- current_output .write_bytes (content )
117- print (f"✓ { current_spec .path } " )
118- return [], []
106+ try :
107+ data_bytes = http_get (api_url , headers )
108+ data = json .loads (data_bytes )
109+ except urllib .error .HTTPError as e :
110+ if e .code == 404 :
111+ # The Trees API returned 404 (e.g. unknown repository or branch).
112+ # If a path was requested, try fetching it directly as a single raw file,
113+ # mirroring the fallback of the previous Contents-API implementation; otherwise re-raise.
114+ if spec .path :
115+ raw_url = f"https://raw.githubusercontent.com/{ spec .owner } /{ spec .repo } /{ spec .branch } /{ urllib .parse .quote (spec .path )} "
116+ try :
117+ content = http_get (raw_url , headers )
118+ output .parent .mkdir (parents = True , exist_ok = True )
119+ output .write_bytes (content )
120+ print (f"✓ { spec .path } " )
121+ return
122+ except urllib .error .HTTPError :
123+ pass # Original 404 was correct
119124 raise
125+ raise
120126
121- if isinstance (data , dict ):
122- data = [data ]
123-
124- local_files = []
125- local_dirs = []
126-
127- for item in data :
128- item_path = item ["path" ]
129- local_path = current_output / Path (item_path ).name
130-
131- if item ["type" ] == "file" :
132- local_files .append ((item ["download_url" ], local_path , item_path ))
133- elif item ["type" ] == "dir" :
134- local_path .mkdir (parents = True , exist_ok = True )
135- local_dirs .append ((item_path , local_path ))
127+ if data .get ("truncated" ):
128+ print (
129+ "Error: GitHub Tree API response is truncated; aborting to avoid an incomplete download." ,
130+ file = sys .stderr ,
131+ )
132+ sys .exit (1 )
136133
137- return local_files , local_dirs
134+ files_to_download = []
138135
139- max_workers = min (32 , (os .cpu_count () or 1 ) * 4 )
136+ # Filter items based on spec.path
137+ target_path = spec .path .strip ("/" )
140138
141- # Discovery phase
142- with ThreadPoolExecutor (max_workers = max_workers ) as executor :
143- futures = {}
139+ found_any = False
144140
145- def submit_spec (s , o ):
146- f = executor .submit (process_node , s , o )
147- futures [f ] = (s , o )
141+ for item in data .get ("tree" , []):
142+ item_path = item ["path" ]
148143
149- submit_spec (spec , output )
144+ # Check if item matches target_path
145+ if (
146+ target_path
147+ and item_path != target_path
148+ and not item_path .startswith (target_path + "/" )
149+ ):
150+ continue
151+
152+ found_any = True
153+
154+ # Determine local path
155+ if target_path :
156+ # Relative path from target_path
157+ rel_path = item_path [len (target_path ) :].lstrip ("/" )
158+ # Detect whether the user-supplied path was intended as a directory
159+ requested_is_dir = spec .path .endswith ("/" )
160+ if not rel_path and item_path == target_path :
161+ # Exact match of the target path
162+ if requested_is_dir and item ["type" ] != "tree" :
163+ raise ValueError (
164+ f"Requested path { spec .path !r} is a directory, but repository "
165+ f"contains a { item ['type' ]} at that path."
166+ )
167+ if not requested_is_dir and item ["type" ] != "blob" :
168+ raise ValueError (
169+ f"Requested path { spec .path !r} is a file, but repository "
170+ f"contains a { item ['type' ]} at that path."
171+ )
172+ # For an exact match with the expected type, use the output path directly.
173+ local_path = output
174+ else :
175+ local_path = output / rel_path
176+ else :
177+ local_path = output / item_path
150178
151- while futures :
152- done , _ = wait (futures , return_when = FIRST_COMPLETED )
153- for future in done :
154- s , o = futures .pop (future )
155- try :
156- f_list , d_list = future .result ()
157- files_to_download .extend (f_list )
179+ if item ["type" ] == "tree" :
180+ local_path .mkdir (parents = True , exist_ok = True )
181+ elif item ["type" ] == "blob" :
182+ encoded_path = "/" .join (urllib .parse .quote (p ) for p in item_path .split ("/" ))
183+ raw_url = f"https://raw.githubusercontent.com/{ spec .owner } /{ spec .repo } /{ spec .branch } /{ encoded_path } "
184+ files_to_download .append ((raw_url , local_path , item_path ))
158185
159- for item_path , local_path in d_list :
160- sub_spec = RepoSpec (
161- s .platform , s .owner , s .repo , item_path , s .branch
162- )
163- submit_spec (sub_spec , local_path )
186+ if not found_any :
187+ # No tree entry matched the requested path (a truncated tree has already aborted above), so try a raw download as a fallback
188+ if target_path :
189+ raw_url = f"https://raw.githubusercontent.com/{ spec .owner } /{ spec .repo } /{ spec .branch } /{ urllib .parse .quote (target_path )} "
190+ try :
191+ content = http_get (raw_url , headers )
192+ output .parent .mkdir (parents = True , exist_ok = True )
193+ output .write_bytes (content )
194+ print (f"✓ { target_path } " )
195+ return
196+ except urllib .error .HTTPError :
197+ pass
164198
165- except Exception as e :
166- print (f"Error processing { s .path } : { e } " , file = sys .stderr )
167- raise
199+ print (f"✗ Path not found: { spec .path } " , file = sys .stderr )
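200+ # Return without raising: the miss has already been reported on stderr
201+ # and nothing has been written to disk at this point.
202+ # NOTE(review): the previous implementation raised on failure; confirm callers do not need an error signal here.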
203+ return
168204
169205 if not files_to_download :
170206 return
171207
172208 # Parallel file downloads
209+ max_workers = min (32 , (os .cpu_count () or 1 ) * 4 )
210+
173211 def download_file (url , path , item_path ):
174212 try :
175213 content = http_get (url , headers )
176- path .parent .mkdir (parents = True , exist_ok = True )
177214 path .write_bytes (content )
178215 print (f"✓ { item_path } " )
179216 except Exception as e :
@@ -191,7 +228,6 @@ def download_file(url, path, item_path):
191228 except Exception :
192229 pass # Already logged
193230
194-
195231def fetch_gitlab (spec : RepoSpec , output : Path , token : Optional [str ] = None ) -> None :
196232 """Download from GitLab using Repository API."""
197233 token = token or os .getenv ("GITLAB_TOKEN" , "" )
0 commit comments