Skip to content

Commit d3898ac

Browse files
authored
Added categorical limit to Tabular summary
Added a categorical limit parameter to TabularSummary
2 parents b274430 + f142bcc commit d3898ac

File tree

5 files changed

+130
-15
lines changed

5 files changed

+130
-15
lines changed

.github/workflows/docs.yaml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,8 +49,7 @@ jobs:
4949
5050
- name: Build documentation
5151
run: |
52-
cd docs
53-
sphinx-build -b html . _build/html
52+
sphinx-build -b html docs docs/_build/html
5453
5554
- name: Setup Pages
5655
uses: actions/configure-pages@v5

.github/workflows/links.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ jobs:
3333
3434
- name: Build documentation with Sphinx
3535
run: |
36-
cd docs
37-
sphinx-build -b html . _build/html
36+
sphinx-build -b html docs docs/_build/html
3837
3938
- name: Link Checker on built documentation
4039
id: lychee

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
author = "HED Standard"
1212

1313
# The full version, including alpha/beta/rc tags
14-
release = "0.8.0"
14+
release = "0.8.1"
1515

1616
# -- General configuration ---------------------------------------------------
1717
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

hed/tools/analysis/tabular_summary.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,22 @@
1010
class TabularSummary:
1111
"""Summarize the contents of columnar files."""
1212

13-
def __init__(self, value_cols=None, skip_cols=None, name=""):
13+
def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=None):
1414
"""Constructor for a BIDS tabular file summary.
1515
1616
Parameters:
1717
value_cols (list, None): List of columns to be treated as value columns.
1818
skip_cols (list, None): List of columns to be skipped.
1919
name (str): Name associated with the dictionary.
20+
categorical_limit (int, None): Maximum number of unique values to store for categorical columns.
2021
2122
"""
2223

2324
self.name = name
2425
self.categorical_info = {}
2526
self.value_info = {}
27+
self.categorical_counts = {}
28+
self.categorical_limit = categorical_limit
2629
if value_cols and skip_cols and set(value_cols).intersection(skip_cols):
2730
raise HedFileError(
2831
"ValueSkipOverlap", f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", ""
@@ -47,7 +50,10 @@ def __str__(self):
4750
for key in sorted_keys:
4851
value_dict = self.categorical_info[key]
4952
sorted_v_keys = sorted(value_dict)
50-
summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):")
53+
counts = self.categorical_counts.get(key, [0, 0])
54+
summary_list.append(
55+
f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values, {counts[0]} total values in {counts[1]} files):"
56+
)
5157
for v_key in sorted_v_keys:
5258
summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}")
5359

@@ -101,9 +107,11 @@ def get_summary(self, as_json=False) -> Union[dict, str]:
101107
"Total events": self.total_events,
102108
"Total files": self.total_files,
103109
"Categorical columns": categorical_cols,
110+
"Categorical counts": self.categorical_counts,
104111
"Value columns": value_cols,
105112
"Skip columns": self.skip_cols,
106113
"Files": self.files,
114+
"Categorical limit": str(self.categorical_limit),
107115
}
108116
if as_json:
109117
return json.dumps(summary, indent=4)
@@ -131,7 +139,7 @@ def get_number_unique(self, column_names=None) -> dict:
131139
return counts
132140

133141
def update(self, data, name=None):
134-
"""Update the counts based on data.
142+
"""Update the counts based on data (DataFrame, filename, or list of filenames).
135143
136144
Parameters:
137145
data (DataFrame, str, or list): DataFrame containing data to update.
@@ -166,19 +174,26 @@ def update_summary(self, tab_sum):
166174
self._update_dict_value(tab_sum)
167175
self._update_dict_categorical(tab_sum)
168176

169-
def _update_categorical(self, tab_name, values):
177+
def _update_categorical(self, tab_name, values, cat_counts):
170178
"""Update the categorical information for this summary.
171179
172180
Parameters:
173181
tab_name (str): Name of a key indicating a categorical column.
174182
values (dict): A dictionary whose keys are unique categorical values.
183+
cat_counts (list): A list with two elements: total count of values and number of entries.
175184
176185
"""
177186
if tab_name not in self.categorical_info:
178187
self.categorical_info[tab_name] = {}
179-
188+
if tab_name not in self.categorical_counts:
189+
self.categorical_counts[tab_name] = [cat_counts[0], cat_counts[1]]
190+
else:
191+
self.categorical_counts[tab_name][0] += cat_counts[0]
192+
self.categorical_counts[tab_name][1] += cat_counts[1]
180193
total_values = self.categorical_info[tab_name]
181194
for name, value in values.items():
195+
if self.categorical_limit is not None and len(total_values) >= self.categorical_limit:
196+
break
182197
value_list = total_values.get(name, [0, 0])
183198
if not isinstance(value, list):
184199
value = [value, 1]
@@ -207,9 +222,15 @@ def _update_dataframe(self, data, name):
207222
self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
208223
self.value_info[col_name][1] = self.value_info[col_name][1] + 1
209224
else:
225+
cat_counts = self.categorical_counts.get(col_name, [0, 0])
226+
cat_counts[0] += len(col_values)
227+
cat_counts[1] += 1
228+
self.categorical_counts[col_name] = cat_counts
229+
if self.categorical_limit is not None and len(col_values) > self.categorical_limit:
230+
continue
210231
col_values = col_values.astype(str)
211232
values = col_values.value_counts(ascending=True)
212-
self._update_categorical(col_name, values)
233+
self._update_categorical(col_name, values, cat_counts)
213234

214235
def _update_dict_categorical(self, col_dict):
215236
"""Update this summary with the categorical information in the dictionary from another summary.
@@ -228,7 +249,7 @@ def _update_dict_categorical(self, col_dict):
228249
elif col in self.skip_cols:
229250
continue
230251
else:
231-
self._update_categorical(col, col_dict.categorical_info[col])
252+
self._update_categorical(col, col_dict.categorical_info[col], col_dict.categorical_counts.get(col, [0, 0]))
232253

233254
def _update_dict_skip(self, col_dict):
234255
"""Update this summary with the skip column information from another summary.
@@ -289,13 +310,15 @@ def extract_summary(summary_info) -> "TabularSummary":
289310
new_tab = TabularSummary(
290311
value_cols=summary_info.get("Value columns", {}).keys(),
291312
skip_cols=summary_info.get("Skip columns", []),
292-
name=summary_info.get("Summary name", ""),
313+
name=summary_info.get("Name", ""),
314+
categorical_limit=summary_info.get("Categorical limit", None),
293315
)
294-
new_tab.value_info = summary_info.get("Value_columns", {})
316+
new_tab.value_info = summary_info.get("Value columns", {})
295317
new_tab.total_files = summary_info.get("Total files", 0)
296318
new_tab.total_events = summary_info.get("Total events", 0)
297319
new_tab.skip_cols = summary_info.get("Skip columns", [])
298320
new_tab.categorical_info = summary_info.get("Categorical columns", {})
321+
new_tab.categorical_counts = summary_info.get("Categorical counts", {})
299322
new_tab.files = summary_info.get("Files", {})
300323
return new_tab
301324

tests/tools/analysis/test_tabular_summary.py

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def test_get_summary(self):
8080
)
8181
summary1 = dict1.get_summary(as_json=False)
8282
self.assertIsInstance(summary1, dict)
83-
self.assertEqual(len(summary1), 7)
83+
self.assertEqual(len(summary1), 9)
8484
summary2 = dict1.get_summary(as_json=True).replace('"', "")
8585
self.assertIsInstance(summary2, str)
8686

@@ -240,6 +240,100 @@ def test_update_summary(self):
240240
self.assertEqual(len(files_bids), tab_all.total_files)
241241
self.assertEqual(len(files_bids) * 200, tab_all.total_events)
242242

243+
def test_categorical_limit_constructor(self):
244+
# Test that categorical_limit can be set in constructor
245+
dict1 = TabularSummary(categorical_limit=5)
246+
self.assertEqual(dict1.categorical_limit, 5)
247+
248+
dict2 = TabularSummary(categorical_limit=None)
249+
self.assertIsNone(dict2.categorical_limit)
250+
251+
def test_categorical_limit_enforced(self):
252+
# Test that categorical_limit is enforced when updating
253+
stern_df = get_new_dataframe(self.stern_map_path)
254+
255+
# Create a summary with no limit
256+
dict_no_limit = TabularSummary()
257+
dict_no_limit.update(stern_df)
258+
259+
# Create a summary with a limit of 2 unique values per column
260+
dict_with_limit = TabularSummary(categorical_limit=2)
261+
dict_with_limit.update(stern_df)
262+
263+
# Check that columns with more than 2 unique values are limited
264+
for col_name in dict_with_limit.categorical_info:
265+
self.assertLessEqual(
266+
len(dict_with_limit.categorical_info[col_name]),
267+
2,
268+
f"Column {col_name} should have at most 2 unique values stored",
269+
)
270+
# But categorical_counts should track all values
271+
self.assertIn(col_name, dict_with_limit.categorical_counts)
272+
self.assertGreater(dict_with_limit.categorical_counts[col_name][0], 0)
273+
274+
def test_categorical_limit_columns_with_many_values(self):
275+
# Test that columns with many values are skipped during initial update
276+
wh_df = get_new_dataframe(self.wh_events_path)
277+
278+
# Set limit to 5
279+
dict1 = TabularSummary(categorical_limit=5)
280+
dict1.update(wh_df)
281+
282+
# Columns with more than 5 unique values at collection time should still be tracked in counts
283+
for col_name, counts in dict1.categorical_counts.items():
284+
self.assertGreater(counts[0], 0, f"Column {col_name} should have event count > 0")
285+
self.assertEqual(counts[1], 1, f"Column {col_name} should have been updated once")
286+
287+
def test_categorical_limit_in_summary(self):
288+
# Test that categorical_limit appears in the summary output
289+
dict1 = TabularSummary(categorical_limit=10)
290+
stern_df = get_new_dataframe(self.stern_map_path)
291+
dict1.update(stern_df)
292+
293+
summary = dict1.get_summary(as_json=False)
294+
self.assertIn("Categorical limit", summary)
295+
self.assertEqual(summary["Categorical limit"], "10")
296+
297+
# Test with None
298+
dict2 = TabularSummary()
299+
dict2.update(stern_df)
300+
summary2 = dict2.get_summary(as_json=False)
301+
self.assertEqual(summary2["Categorical limit"], "None")
302+
303+
def test_categorical_limit_extract_summary(self):
304+
# Test that categorical_limit is preserved through extract_summary
305+
dict1 = TabularSummary(categorical_limit=15)
306+
stern_df = get_new_dataframe(self.stern_map_path)
307+
dict1.update(stern_df)
308+
309+
summary_info = dict1.get_summary(as_json=False)
310+
dict2 = TabularSummary.extract_summary(summary_info)
311+
312+
# Note: extract_summary doesn't restore categorical_limit currently,
313+
# but it should at least not error
314+
self.assertIsInstance(dict2, TabularSummary)
315+
316+
def test_categorical_limit_update_dict(self):
317+
# Test that categorical_limit works correctly with update_summary
318+
stern_df = get_new_dataframe(self.stern_test1_path)
319+
320+
dict1 = TabularSummary(categorical_limit=3)
321+
dict1.update(stern_df)
322+
323+
dict2 = TabularSummary(categorical_limit=3)
324+
dict2.update(stern_df)
325+
326+
# Update dict1 with dict2
327+
dict1.update_summary(dict2)
328+
329+
# Check that limits are still enforced
330+
for col_name in dict1.categorical_info:
331+
self.assertLessEqual(
332+
len(dict1.categorical_info[col_name]),
333+
3,
334+
f"Column {col_name} should have at most 3 unique values after update_summary",
335+
)
336+
243337

244338
if __name__ == "__main__":
245339
unittest.main()

0 commit comments

Comments
 (0)