diff --git a/classifier.py b/classifier.py
index 4252da9..59ff0fb 100755
--- a/classifier.py
+++ b/classifier.py
@@ -7,6 +7,7 @@
import numpy as np
import matplotlib.pyplot as plt
import pickle
+import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
@@ -26,12 +27,12 @@
DEFAULT_FEATURES = [
"percent_redundant_boost",
- "difflib_rewards",
"difflib_slot",
- "difflib_slot_rev",
+ "spearman_correlation",
+ "norm_reward",
]
-DEFAULT_GRAFFITI_ONLY = ["Lodestar"]
+DEFAULT_GRAFFITI_ONLY = ["Grandine", "Lodestar"]  # graffiti-only for now: too hard right now (presumably to tell apart by block features - TODO confirm)
VIABLE_FEATURES = [
"percent_redundant_boost",
@@ -183,7 +184,7 @@ def plot_feature_matrix(self, output_path):
z = self.feature_matrix[:, 2]
scatter = ax.scatter(
- x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1"
+ x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Dark2"
)
handles, _ = scatter.legend_elements()
@@ -204,6 +205,45 @@ def plot_feature_matrix(self, output_path):
else:
fig.savefig(output_path)
+    def plot_feature_matrix_interactive(self, output_path):
+        """Render the training data as an interactive 3D Plotly scatter plot.
+
+        The first three columns of ``self.feature_matrix`` become the x, y and
+        z coordinates, and the axes are titled with the first three entries of
+        ``self.features``. Each point is coloured by its entry in
+        ``self.training_labels``; the hover text and colour-bar labels show the
+        ``CLIENTS`` name found at that label's index.
+
+        Args:
+            output_path: Destination of the HTML file. When ``None`` the figure
+                is displayed with ``fig.show()`` instead of being written.
+        """
+        # Training labels are used as indices into CLIENTS.
+        int_to_client_name = dict(enumerate(CLIENTS))
+        text = [int_to_client_name[i] for i in self.training_labels]
+
+        # Plotly hover templates break lines with "<br>"; a raw newline inside
+        # a single-quoted string literal is a SyntaxError. The %{...}
+        # placeholders are filled in by Plotly per point, hence the doubled
+        # braces inside the f-strings.
+        hovertemplate = (
+            "%{text}<br>"
+            + f"{self.features[0]}: %{{x}}<br>"
+            + f"{self.features[1]}: %{{y}}<br>"
+            + f"{self.features[2]}: %{{z}}"
+        )
+
+        fig = go.Figure(
+            data=go.Scatter3d(
+                x=self.feature_matrix[:, 0],
+                y=self.feature_matrix[:, 1],
+                z=self.feature_matrix[:, 2],
+                mode="markers",
+                marker=dict(
+                    size=5,
+                    color=self.training_labels,
+                    colorscale="ylgnbu",
+                    opacity=0.8,
+                    colorbar=dict(labelalias=int_to_client_name),
+                ),
+                text=text,  # hover text
+                hovertemplate=hovertemplate,
+            )
+        )
+
+        fig.update_layout(
+            scene=dict(
+                xaxis_title=self.features[0],
+                yaxis_title=self.features[1],
+                zaxis_title=self.features[2],
+            ),
+            title="3D Feature Matrix",
+        )
+
+        if output_path is None:
+            fig.show()
+        else:
+            fig.write_html(output_path)  # Creates interactive HTML file
+
def compute_guess_list(probability_map, enabled_clients) -> list:
guesses = []
@@ -364,7 +404,7 @@ def main():
)
if args.plot is not None:
- classifier.plot_feature_matrix(args.plot)
+ classifier.plot_feature_matrix_interactive(args.plot)
print("plot of training data written to {}".format(args.plot))
frequency_map = {}
diff --git a/feature_selection.py b/feature_selection.py
index 765248c..9250a07 100644
--- a/feature_selection.py
+++ b/feature_selection.py
@@ -103,6 +103,32 @@ def feat_spearman_correlation(block_reward):
).correlation
+def feat_spearman_correlation_lodestar(block_reward):
+    """Spearman correlation of distance-scaled attestation rewards vs their ascending sort.
+
+    Each attestation's summed reward is divided by a slot difference before the
+    resulting sequence is compared with its sorted version; this is the
+    "total_rewards / inclusion distance" ordering attributed to Lodestar.
+
+    NOTE(review): the divisor is computed as attestation slot minus block slot.
+    Presumably attestation slots precede the block slot, which would make the
+    divisor negative - confirm the sign is intended.
+    NOTE(review): when "attestations" is missing or has fewer entries than
+    "per_attestation_rewards", the lookup by index raises IndexError - confirm
+    this feature is only used on data that carries the field.
+    """
+    rewards_section = block_reward["attestation_rewards"]
+    reward_maps = rewards_section["per_attestation_rewards"]
+    block_slot = int(block_reward["meta"]["slot"])
+
+    slot_deltas = [
+        int(attestation["slot"]) - block_slot
+        for attestation in rewards_section.get("attestations") or []
+    ]
+    scaled_totals = [
+        sum(reward_map.values()) / slot_deltas[index]
+        for index, reward_map in enumerate(reward_maps)
+    ]
+    ascending_totals = sorted(scaled_totals)
+
+    # The Spearman coefficient is undefined for constant sequences, so any
+    # already-sorted input (which includes constant ones) is reported as 1.0.
+    if scaled_totals != ascending_totals:
+        return scipy.stats.spearmanr(scaled_totals, ascending_totals).correlation
+    return 1.0
+
+
def feat_total_reward(block_reward):
total_reward = block_reward["attestation_rewards"]["total"]
return total_reward
@@ -134,6 +160,17 @@ def feat_median_density(block_reward):
return safe_median(densities)
+def feat_median_density_electra(block_reward):
+    """Median attestation density, scaled against 32 * TARGET_COMMITTEE_SIZE.
+
+    For every entry of "per_attestation_rewards" the number of rewarded items
+    is divided by ``32 * TARGET_COMMITTEE_SIZE``; the median of those ratios is
+    returned via ``safe_median``.
+
+    NOTE(review): the factor 32 is hard-coded; presumably it scales the
+    committee size for Electra-style attestations - confirm what it represents.
+    """
+    reward_maps = block_reward["attestation_rewards"]["per_attestation_rewards"]
+    capacity = 32 * TARGET_COMMITTEE_SIZE
+    return safe_median([count / capacity for count in map(len, reward_maps)])
+
+
def feat_mean_density(block_reward):
per_attestation_rewards = block_reward["attestation_rewards"][
"per_attestation_rewards"
@@ -206,11 +243,13 @@ def f(block_reward):
lambda x: (x[0], x[3]), reverse=True
),
"spearman_correlation": feat_spearman_correlation,
+ "spearman_correlation_lodestar": feat_spearman_correlation_lodestar,
"reward": feat_total_reward,
"norm_reward": feat_total_reward_norm,
"norm_reward_per_slot": scale_by_num_slots(feat_total_reward_norm),
"reward_per_attestation": scale_by_num_attestations(feat_total_reward),
"median_density": feat_median_density,
+ "median_density_electra": feat_median_density_electra,
"mean_density": feat_mean_density,
"num_single_bit": feat_num_single_bit,
"percent_single_bit": scale_by_num_attestations(feat_num_single_bit),
diff --git a/prepare_training_data.py b/prepare_training_data.py
index d84871a..8c08b0f 100755
--- a/prepare_training_data.py
+++ b/prepare_training_data.py
@@ -15,7 +15,7 @@
CLIENTS = ["Grandine", "Lighthouse", "Lodestar", "Nimbus", "Other", "Prysm", "Teku"]
REGEX_PATTERNS = {
- "Grandine": [],
+ "Grandine": [r".*[Gg]randine.*"],
"Lighthouse": [r".*[Ll]ighthouse", r"RP-[A-Z]?L v[0-9]*\.[0-9]*\.[0-9]*.*"],
"Teku": [r".*[Tt]eku", r"RP-[A-Z]?T v[0-9]*\.[0-9]*\.[0-9]*.*"],
"Nimbus": [r".*[Nn]imbus", r"RP-[A-Z]?N v[0-9]*\.[0-9]*\.[0-9]*.*"],
@@ -62,6 +62,7 @@ def process_file(
raw_data_dir: str, proc_data_dir: str, disabled_clients: list[str], file_name: str
) -> None:
with open(os.path.join(raw_data_dir, file_name), "r") as f:
+ print(f"Processing {file_name}")
rewards = json.load(f)
res = classify_rewards_by_graffiti(rewards, disabled_clients=disabled_clients)
diff --git a/requirements.txt b/requirements.txt
index 2f8d11c..1427efc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ sseclient-py==1.8.0
gunicorn==21.2.0
matplotlib==3.8.0
scipy==1.11.3
+plotly==6.1.2