sigp · michaelsproul · Jun 26, 2025 · Jun 26, 2025 · Jun 27, 2025
diff --git a/classifier.py b/classifier.py
@@ -7,6 +7,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import pickle
+import plotly.graph_objects as go
 
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.neural_network import MLPClassifier
@@ -26,12 +27,12 @@
 
 DEFAULT_FEATURES = [
     "percent_redundant_boost",
-    "difflib_rewards",
     "difflib_slot",
-    "difflib_slot_rev",
+    "spearman_correlation",
+    "norm_reward",
 ]
 
-DEFAULT_GRAFFITI_ONLY = ["Lodestar"]
+DEFAULT_GRAFFITI_ONLY = ["Grandine", "Lodestar"]  # too hard right now
 
 VIABLE_FEATURES = [
     "percent_redundant_boost",
@@ -183,7 +184,7 @@ def plot_feature_matrix(self, output_path):
         z = self.feature_matrix[:, 2]
 
         scatter = ax.scatter(
-            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1"
+            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Dark2"
         )
 
         handles, _ = scatter.legend_elements()
@@ -204,6 +205,45 @@ def plot_feature_matrix(self, output_path):
         else:
             fig.savefig(output_path)
 
+    def plot_feature_matrix_interactive(self, output_path):
+        int_to_client_name = {i: client for (i, client) in enumerate(CLIENTS)}
+        text = [int_to_client_name[i] for i in self.training_labels]
+
+        fig = go.Figure(
+            data=go.Scatter3d(
+                x=self.feature_matrix[:, 0],
+                y=self.feature_matrix[:, 1],
+                z=self.feature_matrix[:, 2],
+                mode="markers",
+                marker=dict(
+                    size=5,
+                    color=self.training_labels,
+                    colorscale="ylgnbu",
+                    opacity=0.8,
+                    colorbar=dict(labelalias=int_to_client_name),
+                ),
+                text=text,  # hover text
+                hovertemplate="%{text}<br>"
+                + f"{self.features[0]}: %{{x}}<br>"
+                + f"{self.features[1]}: %{{y}}<br>"
+                + f"{self.features[2]}: %{{z}}<extra></extra>",
+            )
+        )
+
+        fig.update_layout(
+            scene=dict(
+                xaxis_title=self.features[0],
+                yaxis_title=self.features[1],
+                zaxis_title=self.features[2],
+            ),
+            title="3D Feature Matrix",
+        )
+
+        if output_path is None:
+            fig.show()
+        else:
+            fig.write_html(output_path)  # Creates interactive HTML file
+
 
 def compute_guess_list(probability_map, enabled_clients) -> list:
     guesses = []
@@ -364,7 +404,7 @@ def main():
     )
 
     if args.plot is not None:
-        classifier.plot_feature_matrix(args.plot)
+        classifier.plot_feature_matrix_interactive(args.plot)
         print("plot of training data written to {}".format(args.plot))
 
     frequency_map = {}

diff --git a/feature_selection.py b/feature_selection.py
@@ -103,6 +103,32 @@ def feat_spearman_correlation(block_reward):
         ).correlation
 
 
+def feat_spearman_correlation_lodestar(block_reward):
+    """Spearman correlation coefficient for the per-attestation rewards vs their sorted version.
+
+    This variant sorts by each attestation's total reward / inclusion distance, which is what Lodestar uses.
+    """
+    per_attestation_rewards = block_reward["attestation_rewards"][
+        "per_attestation_rewards"
+    ]
+    slot = int(block_reward["meta"]["slot"])
+    attestation_data = block_reward["attestation_rewards"].get("attestations") or []
+    inclusion_distances = [int(att["slot"]) - slot for att in attestation_data]
+    attestation_totals = [
+        sum(rewards.values()) / inclusion_distances[i]
+        for i, rewards in enumerate(per_attestation_rewards)
+    ]
+    sorted_attestation_totals = sorted(attestation_totals)
+    # Spearman coefficient isn't defined for uniform/constant sequences, so we default to 1.0
+    # whenever the totals are already in sorted order (which covers the constant case).
+    if attestation_totals == sorted_attestation_totals:
+        return 1.0
+    else:
+        return scipy.stats.spearmanr(
+            attestation_totals, sorted_attestation_totals
+        ).correlation
+
+
 def feat_total_reward(block_reward):
     total_reward = block_reward["attestation_rewards"]["total"]
     return total_reward
@@ -134,6 +160,17 @@ def feat_median_density(block_reward):
     return safe_median(densities)
 
 
+def feat_median_density_electra(block_reward):
+    per_attestation_rewards = block_reward["attestation_rewards"][
+        "per_attestation_rewards"
+    ]
+    densities = [
+        len(rewards) / (32 * TARGET_COMMITTEE_SIZE)
+        for rewards in per_attestation_rewards
+    ]
+    return safe_median(densities)
+
+
 def feat_mean_density(block_reward):
     per_attestation_rewards = block_reward["attestation_rewards"][
         "per_attestation_rewards"
@@ -206,11 +243,13 @@ def f(block_reward):
         lambda x: (x[0], x[3]), reverse=True
     ),
     "spearman_correlation": feat_spearman_correlation,
+    "spearman_correlation_lodestar": feat_spearman_correlation_lodestar,
     "reward": feat_total_reward,
     "norm_reward": feat_total_reward_norm,
     "norm_reward_per_slot": scale_by_num_slots(feat_total_reward_norm),
     "reward_per_attestation": scale_by_num_attestations(feat_total_reward),
     "median_density": feat_median_density,
+    "median_density_electra": feat_median_density_electra,
     "mean_density": feat_mean_density,
     "num_single_bit": feat_num_single_bit,
     "percent_single_bit": scale_by_num_attestations(feat_num_single_bit),

diff --git a/prepare_training_data.py b/prepare_training_data.py
@@ -15,7 +15,7 @@
 CLIENTS = ["Grandine", "Lighthouse", "Lodestar", "Nimbus", "Other", "Prysm", "Teku"]
 
 REGEX_PATTERNS = {
-    "Grandine": [],
+    "Grandine": [r".*[Gg]randine.*"],
     "Lighthouse": [r".*[Ll]ighthouse", r"RP-[A-Z]?L v[0-9]*\.[0-9]*\.[0-9]*.*"],
     "Teku": [r".*[Tt]eku", r"RP-[A-Z]?T v[0-9]*\.[0-9]*\.[0-9]*.*"],
     "Nimbus": [r".*[Nn]imbus", r"RP-[A-Z]?N v[0-9]*\.[0-9]*\.[0-9]*.*"],
@@ -62,6 +62,7 @@ def process_file(
     raw_data_dir: str, proc_data_dir: str, disabled_clients: list[str], file_name: str
 ) -> None:
     with open(os.path.join(raw_data_dir, file_name), "r") as f:
+        print(f"Processing {file_name}")
         rewards = json.load(f)
 
     res = classify_rewards_by_graffiti(rewards, disabled_clients=disabled_clients)

diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ sseclient-py==1.8.0
 gunicorn==21.2.0
 matplotlib==3.8.0
 scipy==1.11.3
+plotly==6.1.2