1010class TabularSummary :
1111 """Summarize the contents of columnar files."""
1212
13- def __init__ (self , value_cols = None , skip_cols = None , name = "" ):
13+ def __init__ (self , value_cols = None , skip_cols = None , name = "" , categorical_limit = None ):
1414 """Constructor for a BIDS tabular file summary.
1515
1616 Parameters:
1717 value_cols (list, None): List of columns to be treated as value columns.
1818 skip_cols (list, None): List of columns to be skipped.
1919 name (str): Name associated with the dictionary.
20+ categorical_limit (int, None): Maximum number of unique values to store for categorical columns.
2021
2122 """
2223
2324 self .name = name
2425 self .categorical_info = {}
2526 self .value_info = {}
27+ self .categorical_counts = {}
28+ self .categorical_limit = categorical_limit
2629 if value_cols and skip_cols and set (value_cols ).intersection (skip_cols ):
2730 raise HedFileError (
2831 "ValueSkipOverlap" , f"Value columns { str (value_cols )} and skip columns { str (skip_cols )} cannot overlap" , ""
@@ -47,7 +50,10 @@ def __str__(self):
4750 for key in sorted_keys :
4851 value_dict = self .categorical_info [key ]
4952 sorted_v_keys = sorted (value_dict )
50- summary_list .append (f"{ indent * 2 } { key } ({ len (sorted_v_keys )} distinct values):" )
53+ counts = self .categorical_counts .get (key , [0 , 0 ])
54+ summary_list .append (
55+ f"{ indent * 2 } { key } ({ len (sorted_v_keys )} distinct values, { counts [0 ]} total values in { counts [1 ]} files):"
56+ )
5157 for v_key in sorted_v_keys :
5258 summary_list .append (f"{ indent * 3 } { v_key } : { value_dict [v_key ]} " )
5359
@@ -101,9 +107,11 @@ def get_summary(self, as_json=False) -> Union[dict, str]:
101107 "Total events" : self .total_events ,
102108 "Total files" : self .total_files ,
103109 "Categorical columns" : categorical_cols ,
110+ "Categorical counts" : self .categorical_counts ,
104111 "Value columns" : value_cols ,
105112 "Skip columns" : self .skip_cols ,
106113 "Files" : self .files ,
114+ "Categorical limit" : str (self .categorical_limit ),
107115 }
108116 if as_json :
109117 return json .dumps (summary , indent = 4 )
@@ -131,7 +139,7 @@ def get_number_unique(self, column_names=None) -> dict:
131139 return counts
132140
133141 def update (self , data , name = None ):
134- """Update the counts based on data.
142+ """Update the counts based on data (DataFrame, filename, or list of filenames) .
135143
136144 Parameters:
137145 data (DataFrame, str, or list): DataFrame containing data to update.
@@ -166,19 +174,26 @@ def update_summary(self, tab_sum):
166174 self ._update_dict_value (tab_sum )
167175 self ._update_dict_categorical (tab_sum )
168176
169- def _update_categorical (self , tab_name , values ):
177+ def _update_categorical (self , tab_name , values , cat_counts ):
170178 """Update the categorical information for this summary.
171179
172180 Parameters:
173181 tab_name (str): Name of a key indicating a categorical column.
174182 values (dict): A dictionary whose keys are unique categorical values.
183+ cat_counts (list): A two-element list: [total number of values, number of files contributing to the column].
175184
176185 """
177186 if tab_name not in self .categorical_info :
178187 self .categorical_info [tab_name ] = {}
179-
188+ if tab_name not in self .categorical_counts :
189+ self .categorical_counts [tab_name ] = [cat_counts [0 ], cat_counts [1 ]]
190+ else :
191+ self .categorical_counts [tab_name ][0 ] += cat_counts [0 ]
192+ self .categorical_counts [tab_name ][1 ] += cat_counts [1 ]
180193 total_values = self .categorical_info [tab_name ]
181194 for name , value in values .items ():
195+ if self .categorical_limit is not None and len (total_values ) >= self .categorical_limit :
196+ break
182197 value_list = total_values .get (name , [0 , 0 ])
183198 if not isinstance (value , list ):
184199 value = [value , 1 ]
@@ -207,9 +222,15 @@ def _update_dataframe(self, data, name):
207222 self .value_info [col_name ][0 ] = self .value_info [col_name ][0 ] + len (col_values )
208223 self .value_info [col_name ][1 ] = self .value_info [col_name ][1 ] + 1
209224 else :
225+ cat_counts = self .categorical_counts .get (col_name , [0 , 0 ])
226+ cat_counts [0 ] += len (col_values )
227+ cat_counts [1 ] += 1
228+ self .categorical_counts [col_name ] = cat_counts
229+ if self .categorical_limit is not None and len (col_values ) > self .categorical_limit :
230+ continue
210231 col_values = col_values .astype (str )
211232 values = col_values .value_counts (ascending = True )
212- self ._update_categorical (col_name , values )
233+ self ._update_categorical (col_name , values , cat_counts )
213234
214235 def _update_dict_categorical (self , col_dict ):
215236 """Update this summary with the categorical information in the dictionary from another summary.
@@ -228,7 +249,7 @@ def _update_dict_categorical(self, col_dict):
228249 elif col in self .skip_cols :
229250 continue
230251 else :
231- self ._update_categorical (col , col_dict .categorical_info [col ])
252+ self ._update_categorical (col , col_dict .categorical_info [col ], col_dict . categorical_counts . get ( col , [ 0 , 0 ]) )
232253
233254 def _update_dict_skip (self , col_dict ):
234255 """Update this summary with the skip column information from another summary.
@@ -289,13 +310,15 @@ def extract_summary(summary_info) -> "TabularSummary":
289310 new_tab = TabularSummary (
290311 value_cols = summary_info .get ("Value columns" , {}).keys (),
291312 skip_cols = summary_info .get ("Skip columns" , []),
292- name = summary_info .get ("Summary name" , "" ),
313+ name = summary_info .get ("Name" , "" ),
314+ categorical_limit = summary_info .get ("Categorical limit" , None ),
293315 )
294- new_tab .value_info = summary_info .get ("Value_columns " , {})
316+ new_tab .value_info = summary_info .get ("Value columns " , {})
295317 new_tab .total_files = summary_info .get ("Total files" , 0 )
296318 new_tab .total_events = summary_info .get ("Total events" , 0 )
297319 new_tab .skip_cols = summary_info .get ("Skip columns" , [])
298320 new_tab .categorical_info = summary_info .get ("Categorical columns" , {})
321+ new_tab .categorical_counts = summary_info .get ("Categorical counts" , {})
299322 new_tab .files = summary_info .get ("Files" , {})
300323 return new_tab
301324
0 commit comments