Added categorical limit to Tabular summary

VisLab · web-flow · commit d3898ac6867f · 2025-12-30T14:51:12.000-06:00
Added a categorical limit parameter to TabularSummary
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -49,8 +49,7 @@ jobs:
 
     - name: Build documentation
       run: |
-        cd docs
-        sphinx-build -b html . _build/html
+        sphinx-build -b html docs docs/_build/html
 
     - name: Setup Pages
       uses: actions/configure-pages@v5
diff --git a/.github/workflows/links.yaml b/.github/workflows/links.yaml
@@ -33,8 +33,7 @@ jobs:
 
       - name: Build documentation with Sphinx
         run: |
-          cd docs
-          sphinx-build -b html . _build/html
+          sphinx-build -b html docs docs/_build/html
 
       - name: Link Checker on built documentation
         id: lychee
diff --git a/docs/conf.py b/docs/conf.py
@@ -11,7 +11,7 @@
 author = "HED Standard"
 
 # The full version, including alpha/beta/rc tags
-release = "0.8.0"
+release = "0.8.1"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/hed/tools/analysis/tabular_summary.py b/hed/tools/analysis/tabular_summary.py
@@ -10,19 +10,22 @@
 class TabularSummary:
     """Summarize the contents of columnar files."""
 
-    def __init__(self, value_cols=None, skip_cols=None, name=""):
+    def __init__(self, value_cols=None, skip_cols=None, name="", categorical_limit=None):
         """Constructor for a BIDS tabular file summary.
 
         Parameters:
             value_cols (list, None):  List of columns to be treated as value columns.
             skip_cols (list, None):   List of columns to be skipped.
             name (str):               Name associated with the dictionary.
+            categorical_limit (int, None):  Maximum number of unique values to store for categorical columns.
 
         """
 
         self.name = name
         self.categorical_info = {}
         self.value_info = {}
+        self.categorical_counts = {}
+        self.categorical_limit = categorical_limit
         if value_cols and skip_cols and set(value_cols).intersection(skip_cols):
             raise HedFileError(
                 "ValueSkipOverlap", f"Value columns {str(value_cols)} and skip columns {str(skip_cols)} cannot overlap", ""
@@ -47,7 +50,10 @@ def __str__(self):
         for key in sorted_keys:
             value_dict = self.categorical_info[key]
             sorted_v_keys = sorted(value_dict)
-            summary_list.append(f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values):")
+            counts = self.categorical_counts.get(key, [0, 0])
+            summary_list.append(
+                f"{indent * 2}{key} ({len(sorted_v_keys)} distinct values, {counts[0]} total values in {counts[1]} files):"
+            )
             for v_key in sorted_v_keys:
                 summary_list.append(f"{indent * 3}{v_key}: {value_dict[v_key]}")
 
@@ -101,9 +107,11 @@ def get_summary(self, as_json=False) -> Union[dict, str]:
             "Total events": self.total_events,
             "Total files": self.total_files,
             "Categorical columns": categorical_cols,
+            "Categorical counts": self.categorical_counts,
             "Value columns": value_cols,
             "Skip columns": self.skip_cols,
             "Files": self.files,
+            "Categorical limit": str(self.categorical_limit),
         }
         if as_json:
             return json.dumps(summary, indent=4)
@@ -131,7 +139,7 @@ def get_number_unique(self, column_names=None) -> dict:
         return counts
 
     def update(self, data, name=None):
-        """Update the counts based on data.
+        """Update the counts based on data (DataFrame, filename, or list of filenames).
 
         Parameters:
             data (DataFrame, str, or list):    DataFrame containing data to update.
@@ -166,19 +174,26 @@ def update_summary(self, tab_sum):
         self._update_dict_value(tab_sum)
         self._update_dict_categorical(tab_sum)
 
-    def _update_categorical(self, tab_name, values):
+    def _update_categorical(self, tab_name, values, cat_counts):
         """Update the categorical information for this summary.
 
         Parameters:
             tab_name (str): Name of a key indicating a categorical column.
             values (dict): A dictionary whose keys are unique categorical values.
+            cat_counts (list): Two-element list [number of values, number of files] added to the column's totals.
 
         """
         if tab_name not in self.categorical_info:
             self.categorical_info[tab_name] = {}
-
+        if tab_name not in self.categorical_counts:
+            self.categorical_counts[tab_name] = [cat_counts[0], cat_counts[1]]
+        else:
+            self.categorical_counts[tab_name][0] += cat_counts[0]
+            self.categorical_counts[tab_name][1] += cat_counts[1]
         total_values = self.categorical_info[tab_name]
         for name, value in values.items():
+            if self.categorical_limit is not None and len(total_values) >= self.categorical_limit:
+                break
             value_list = total_values.get(name, [0, 0])
             if not isinstance(value, list):
                 value = [value, 1]
@@ -207,9 +222,15 @@ def _update_dataframe(self, data, name):
                 self.value_info[col_name][0] = self.value_info[col_name][0] + len(col_values)
                 self.value_info[col_name][1] = self.value_info[col_name][1] + 1
             else:
+                # Totals are recorded here, so a zero delta is passed below to avoid counting this file twice.
+                cat_counts = self.categorical_counts.setdefault(col_name, [0, 0])
+                cat_counts[0] += len(col_values)
+                cat_counts[1] += 1
+                if self.categorical_limit is not None and len(col_values) > self.categorical_limit:
+                    continue
                 col_values = col_values.astype(str)
                 values = col_values.value_counts(ascending=True)
-                self._update_categorical(col_name, values)
+                self._update_categorical(col_name, values, [0, 0])
 
     def _update_dict_categorical(self, col_dict):
         """Update this summary with the categorical information in the dictionary from another summary.
@@ -228,7 +249,7 @@ def _update_dict_categorical(self, col_dict):
             elif col in self.skip_cols:
                 continue
             else:
-                self._update_categorical(col, col_dict.categorical_info[col])
+                self._update_categorical(col, col_dict.categorical_info[col], col_dict.categorical_counts.get(col, [0, 0]))
 
     def _update_dict_skip(self, col_dict):
         """Update this summary with the skip column information from another summary.
@@ -289,13 +310,15 @@ def extract_summary(summary_info) -> "TabularSummary":
         new_tab = TabularSummary(
             value_cols=summary_info.get("Value columns", {}).keys(),
             skip_cols=summary_info.get("Skip columns", []),
-            name=summary_info.get("Summary name", ""),
+            name=summary_info.get("Name", ""),
+            categorical_limit=summary_info.get("Categorical limit", None),
         )
-        new_tab.value_info = summary_info.get("Value_columns", {})
+        new_tab.value_info = summary_info.get("Value columns", {})
         new_tab.total_files = summary_info.get("Total files", 0)
         new_tab.total_events = summary_info.get("Total events", 0)
         new_tab.skip_cols = summary_info.get("Skip columns", [])
         new_tab.categorical_info = summary_info.get("Categorical columns", {})
+        new_tab.categorical_counts = summary_info.get("Categorical counts", {})
         new_tab.files = summary_info.get("Files", {})
         return new_tab
 
diff --git a/tests/tools/analysis/test_tabular_summary.py b/tests/tools/analysis/test_tabular_summary.py
@@ -80,7 +80,7 @@ def test_get_summary(self):
         )
         summary1 = dict1.get_summary(as_json=False)
         self.assertIsInstance(summary1, dict)
-        self.assertEqual(len(summary1), 7)
+        self.assertEqual(len(summary1), 9)
         summary2 = dict1.get_summary(as_json=True).replace('"', "")
         self.assertIsInstance(summary2, str)
 
@@ -240,6 +240,100 @@ def test_update_summary(self):
         self.assertEqual(len(files_bids), tab_all.total_files)
         self.assertEqual(len(files_bids) * 200, tab_all.total_events)
 
+    def test_categorical_limit_constructor(self):
+        # Test that categorical_limit can be set in constructor
+        dict1 = TabularSummary(categorical_limit=5)
+        self.assertEqual(dict1.categorical_limit, 5)
+
+        dict2 = TabularSummary(categorical_limit=None)
+        self.assertIsNone(dict2.categorical_limit)
+
+    def test_categorical_limit_enforced(self):
+        # Test that categorical_limit is enforced when updating
+        stern_df = get_new_dataframe(self.stern_map_path)
+
+        # Create a summary with no limit
+        dict_no_limit = TabularSummary()
+        dict_no_limit.update(stern_df)
+
+        # Create a summary with a limit of 2 unique values per column
+        dict_with_limit = TabularSummary(categorical_limit=2)
+        dict_with_limit.update(stern_df)
+
+        # Check that columns with more than 2 unique values are limited
+        for col_name in dict_with_limit.categorical_info:
+            self.assertLessEqual(
+                len(dict_with_limit.categorical_info[col_name]),
+                2,
+                f"Column {col_name} should have at most 2 unique values stored",
+            )
+            # But categorical_counts should track all values
+            self.assertIn(col_name, dict_with_limit.categorical_counts)
+            self.assertGreater(dict_with_limit.categorical_counts[col_name][0], 0)
+
+    def test_categorical_limit_columns_with_many_values(self):
+        # Test that columns with many values are skipped during initial update
+        wh_df = get_new_dataframe(self.wh_events_path)
+
+        # Set limit to 5
+        dict1 = TabularSummary(categorical_limit=5)
+        dict1.update(wh_df)
+
+        # Columns with more than 5 unique values at collection time should still be tracked in counts
+        for col_name, counts in dict1.categorical_counts.items():
+            self.assertGreater(counts[0], 0, f"Column {col_name} should have event count > 0")
+            self.assertEqual(counts[1], 1, f"Column {col_name} should have been updated once")
+
+    def test_categorical_limit_in_summary(self):
+        # Test that categorical_limit appears in the summary output
+        dict1 = TabularSummary(categorical_limit=10)
+        stern_df = get_new_dataframe(self.stern_map_path)
+        dict1.update(stern_df)
+
+        summary = dict1.get_summary(as_json=False)
+        self.assertIn("Categorical limit", summary)
+        self.assertEqual(summary["Categorical limit"], "10")
+
+        # Test with None
+        dict2 = TabularSummary()
+        dict2.update(stern_df)
+        summary2 = dict2.get_summary(as_json=False)
+        self.assertEqual(summary2["Categorical limit"], "None")
+
+    def test_categorical_limit_extract_summary(self):
+        # Test that categorical_limit is preserved through extract_summary
+        dict1 = TabularSummary(categorical_limit=15)
+        stern_df = get_new_dataframe(self.stern_map_path)
+        dict1.update(stern_df)
+
+        summary_info = dict1.get_summary(as_json=False)
+        dict2 = TabularSummary.extract_summary(summary_info)
+
+        # Note: get_summary serializes categorical_limit with str(), so extract_summary restores
+        # it as the string "15" rather than the int 15; this test only checks that it does not error
+        self.assertIsInstance(dict2, TabularSummary)
+
+    def test_categorical_limit_update_dict(self):
+        # Test that categorical_limit works correctly with update_summary
+        stern_df = get_new_dataframe(self.stern_test1_path)
+
+        dict1 = TabularSummary(categorical_limit=3)
+        dict1.update(stern_df)
+
+        dict2 = TabularSummary(categorical_limit=3)
+        dict2.update(stern_df)
+
+        # Update dict1 with dict2
+        dict1.update_summary(dict2)
+
+        # Check that limits are still enforced
+        for col_name in dict1.categorical_info:
+            self.assertLessEqual(
+                len(dict1.categorical_info[col_name]),
+                3,
+                f"Column {col_name} should have at most 3 unique values after update_summary",
+            )
+
 
 if __name__ == "__main__":
     unittest.main()