44
55import logging
66from importlib import import_module
7+ from pathlib import Path
78from typing import TYPE_CHECKING , Dict , Iterable , Mapping , Optional
89
10+ import fsspec
11+ import pyarrow as pa
12+
913if TYPE_CHECKING :
10- from pathlib import Path
1114 from types import ModuleType
1215
13- import pyarrow as pa
14-
1516 from .config import KnowledgeGraphConfig
1617
1718
@@ -23,38 +24,80 @@ class LanceGraphStore:
2324
def __init__(self, config: "KnowledgeGraphConfig"):
    """Bind the store to *config* and resolve its storage filesystem.

    Args:
        config: Supplies ``storage_path`` (kept unchanged as the store
            root) and ``storage_options`` (forwarded to fsspec as keyword
            arguments; a falsy value is treated as "no options").

    Raises:
        ImportError: Propagated unchanged from ``fsspec.core.url_to_fs``,
            e.g. when the driver for the root's protocol (gcsfs, s3fs,
            ...) is not installed.
    """
    self._config = config
    self._root: Path | str = config.storage_path
    self._lance: Optional[ModuleType] = None
    self._lance_attempted = False

    # fsspec is given the string form of the root; the original object
    # (Path or str) stays in self._root so that paths handed back to
    # callers keep the type the caller configured.
    self._fs, self._fs_path = fsspec.core.url_to_fs(
        str(self._root), **(self.config.storage_options or {})
    )
42+
@property
def config(self) -> "KnowledgeGraphConfig":
    """Return the configuration object this store was constructed with."""
    return self._config
3447
3548 @property
36- def root (self ) -> " Path" :
49+ def root (self ) -> Path | str :
3750 """Return the root path for persisted datasets."""
3851 return self ._root
3952
def ensure_layout(self) -> None:
    """Create the storage root if it does not already exist.

    Creation is best-effort: any error raised by the filesystem's
    ``makedirs`` is logged and then suppressed rather than propagated.
    """
    try:
        self._fs.makedirs(self._fs_path, exist_ok=True)
    except Exception:
        # Object stores (S3/GCS) may not support explicit directory
        # creation, or may create prefixes implicitly, so a failure here
        # is deliberately non-fatal. Log it so that a real problem (for
        # example bad credentials) is still visible to the operator.
        logging.getLogger(__name__).warning(
            "Could not create storage root %s; continuing",
            self._root,
            exc_info=True,
        )
63+
64+ def list_datasets (self ) -> Dict [str , Path | str ]:
4565 """Enumerate known Lance datasets."""
46- datasets : Dict [str , Path ] = {}
47- if not self ._root .exists ():
48- return datasets
49- for child in self ._root .iterdir ():
50- if child .is_dir () and child .suffix == ".lance" :
51- datasets [child .stem ] = child
66+ datasets : Dict [str , Path | str ] = {}
67+
68+ try :
69+ if not self ._fs .exists (self ._fs_path ):
70+ return datasets
71+ infos = self ._fs .ls (self ._fs_path , detail = True )
72+ except Exception as e :
73+ # We want to swallow "not found" errors but raise others (like Auth errors)
74+ if isinstance (e , FileNotFoundError ):
75+ return datasets
76+
77+ msg = str (e ).lower ()
78+ if "not found" in msg or "no such file" in msg or "does not exist" in msg :
79+ return datasets
80+ raise
81+
82+ root_str = str (self ._root )
83+ for info in infos :
84+ name = info ["name" ].rstrip ("/" )
85+ base_name = name .split ("/" )[- 1 ]
86+ if info ["type" ] == "directory" and base_name .endswith (".lance" ):
87+ dataset_name = base_name [:- 6 ]
88+ full_path = f"{ root_str .rstrip ('/' )} /{ base_name } "
89+ if isinstance (self ._root , Path ):
90+ datasets [dataset_name ] = Path (full_path )
91+ else :
92+ datasets [dataset_name ] = full_path
5293 return datasets
5394
54- def _dataset_path (self , name : str ) -> " Path" :
95+ def _dataset_path (self , name : str ) -> Path | str :
5596 """Create the canonical path for a dataset."""
5697 safe_name = name .replace ("/" , "_" )
57- return self ._root / f"{ safe_name } .lance"
98+ if isinstance (self ._root , Path ):
99+ return self ._root / f"{ safe_name } .lance"
100+ return f"{ self ._root .rstrip ('/' )} /{ safe_name } .lance"
58101
59102 def _get_lance (self ) -> ModuleType :
60103 if not self ._lance_attempted :
@@ -77,6 +120,20 @@ def _get_lance(self) -> ModuleType:
77120 raise ImportError ("Lance module failed to load" )
78121 return self ._lance
79122
123+ def _path_exists (self , path : Path | str ) -> bool :
124+ if isinstance (path , Path ):
125+ return path .exists ()
126+ try :
127+ fs , p = fsspec .core .url_to_fs (path )
128+ except Exception :
129+ # If we cannot resolve the filesystem (e.g. missing gcsfs), we should raise
130+ # rather than assuming the path does not exist.
131+ raise
132+ try :
133+ return fs .exists (p )
134+ except Exception :
135+ return False
136+
80137 def load_tables (
81138 self ,
82139 names : Optional [Iterable [str ]] = None ,
@@ -91,17 +148,18 @@ def load_tables(
91148 tables : Dict [str , "pa.Table" ] = {}
92149 for name in requested :
93150 path = available .get (name , self ._dataset_path (name ))
94- if not path . exists ( ):
151+ if not self . _path_exists ( path ):
95152 raise FileNotFoundError (f"Dataset '{ name } ' not found at { path } " )
96- dataset = lance .dataset (str (path ))
153+ dataset = lance .dataset (
154+ str (path ), storage_options = self .config .storage_options
155+ )
97156 table = dataset .scanner ().to_table ()
98157 tables [name ] = table
99158 return tables
100159
101160 def write_tables (self , tables : Mapping [str , "pa.Table" ]) -> None :
102161 """Persist PyArrow tables as Lance datasets."""
103162 lance = self ._get_lance ()
104- import pyarrow as pa # Local import; optional dependency
105163
106164 self .ensure_layout ()
107165 for name , table in tables .items ():
@@ -110,5 +168,7 @@ def write_tables(self, tables: Mapping[str, "pa.Table"]) -> None:
110168 f"Dataset '{ name } ' must be a pyarrow.Table (got { type (table )!r} )"
111169 )
112170 path = self ._dataset_path (name )
113- mode = "overwrite" if path .exists () else "create"
114- lance .write_dataset (table , str (path ), mode = mode )
171+ mode = "overwrite" if self ._path_exists (path ) else "create"
172+ lance .write_dataset (
173+ table , str (path ), mode = mode , storage_options = self .config .storage_options
174+ )
0 commit comments