diff --git a/classifier.py b/classifier.py
index 4252da9..59ff0fb 100755
--- a/classifier.py
+++ b/classifier.py
@@ -7,6 +7,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import pickle
+import plotly.graph_objects as go
 
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.neural_network import MLPClassifier
@@ -26,12 +27,12 @@
 
 DEFAULT_FEATURES = [
     "percent_redundant_boost",
-    "difflib_rewards",
     "difflib_slot",
-    "difflib_slot_rev",
+    "spearman_correlation",
+    "norm_reward",
 ]
 
-DEFAULT_GRAFFITI_ONLY = ["Lodestar"]
+DEFAULT_GRAFFITI_ONLY = ["Grandine", "Lodestar"]  # too hard rn
 
 VIABLE_FEATURES = [
     "percent_redundant_boost",
@@ -183,7 +184,7 @@ def plot_feature_matrix(self, output_path):
         z = self.feature_matrix[:, 2]
 
         scatter = ax.scatter(
-            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Set1"
+            x, y, z, c=self.training_labels, marker=".", alpha=0.25, cmap="Dark2"
         )
 
         handles, _ = scatter.legend_elements()
@@ -204,6 +205,51 @@ def plot_feature_matrix(self, output_path):
         else:
             fig.savefig(output_path)
 
+    def plot_feature_matrix_interactive(self, output_path):
+        """Plot the first three feature columns as an interactive 3D scatter.
+
+        Points are coloured by training label; hover text shows the client name
+        (CLIENTS indexed by label) and the three feature values. The figure is
+        shown when `output_path` is None, otherwise written there as HTML.
+        """
+        int_to_client_name = {i: client for (i, client) in enumerate(CLIENTS)}
+        text = [int_to_client_name[i] for i in self.training_labels]
+
+        fig = go.Figure(
+            data=go.Scatter3d(
+                x=self.feature_matrix[:, 0],
+                y=self.feature_matrix[:, 1],
+                z=self.feature_matrix[:, 2],
+                mode="markers",
+                marker=dict(
+                    size=5,
+                    color=self.training_labels,
+                    colorscale="ylgnbu",
+                    opacity=0.8,
+                    colorbar=dict(labelalias=int_to_client_name),
+                ),
+                text=text,  # hover text
+                hovertemplate="%{text}<br>"
+                f"{self.features[0]}: %{{x}}<br>"
+                f"{self.features[1]}: %{{y}}<br>"
+                f"{self.features[2]}: %{{z}}",
+            )
+        )
+
+        fig.update_layout(
+            scene=dict(
+                xaxis_title=self.features[0],
+                yaxis_title=self.features[1],
+                zaxis_title=self.features[2],
+            ),
+            title="3D Feature Matrix",
+        )
+
+        if output_path is None:
+            fig.show()
+        else:
+            fig.write_html(output_path)  # Creates interactive HTML file
+
 
 def compute_guess_list(probability_map, enabled_clients) -> list:
     guesses = []
@@ -364,7 +410,7 @@ def main():
     )
 
     if args.plot is not None:
-        classifier.plot_feature_matrix(args.plot)
+        classifier.plot_feature_matrix_interactive(args.plot)
         print("plot of training data written to {}".format(args.plot))
 
     frequency_map = {}
diff --git a/feature_selection.py b/feature_selection.py
index 765248c..9250a07 100644
--- a/feature_selection.py
+++ b/feature_selection.py
@@ -103,6 +103,36 @@ def feat_spearman_correlation(block_reward):
         ).correlation
 
 
+def feat_spearman_correlation_lodestar(block_reward):
+    """Spearman correlation coefficient for the per attestation rewards vs their sorted version
+
+    This variant sorts by total_rewards / inclusion distance, which is what Lodestar uses.
+    """
+    per_attestation_rewards = block_reward["attestation_rewards"][
+        "per_attestation_rewards"
+    ]
+    slot = int(block_reward["meta"]["slot"])
+    # NOTE(review): if "attestations" is missing this becomes [] and the indexing
+    # below raises IndexError for any non-empty reward list; confirm intended.
+    attestation_data = block_reward["attestation_rewards"].get("attestations") or []
+    # NOTE(review): this is attestation slot minus block slot; if attestation slots
+    # precede the block slot the quotients below are negative; confirm the sign.
+    inclusion_distances = [int(att["slot"]) - slot for att in attestation_data]
+    attestation_totals = [
+        sum(rewards.values()) / inclusion_distances[i]
+        for i, rewards in enumerate(per_attestation_rewards)
+    ]
+    sorted_attestation_totals = sorted(attestation_totals)
+    # Spearman coefficient isn't defined for uniform/constant sequences, so we just default
+    # that to 1.0
+    if attestation_totals == sorted_attestation_totals:
+        return 1.0
+    else:
+        return scipy.stats.spearmanr(
+            attestation_totals, sorted_attestation_totals
+        ).correlation
+
+
 def feat_total_reward(block_reward):
     total_reward = block_reward["attestation_rewards"]["total"]
     return total_reward
@@ -134,6 +164,23 @@ def feat_median_density(block_reward):
     return safe_median(densities)
 
 
+def feat_median_density_electra(block_reward):
+    """Return `safe_median` of the per-attestation densities (Electra variant).
+
+    Each density is the number of entries in an attestation's reward map divided
+    by `32 * TARGET_COMMITTEE_SIZE`.
+    NOTE(review): the factor 32 is unexplained here; confirm what it stands for.
+    """
+    per_attestation_rewards = block_reward["attestation_rewards"][
+        "per_attestation_rewards"
+    ]
+    densities = [
+        len(rewards) / (32 * TARGET_COMMITTEE_SIZE)
+        for rewards in per_attestation_rewards
+    ]
+    return safe_median(densities)
+
+
 def feat_mean_density(block_reward):
     per_attestation_rewards = block_reward["attestation_rewards"][
         "per_attestation_rewards"
@@ -206,11 +253,13 @@ def f(block_reward):
         lambda x: (x[0], x[3]), reverse=True
     ),
     "spearman_correlation": feat_spearman_correlation,
+    "spearman_correlation_lodestar": feat_spearman_correlation_lodestar,
     "reward": feat_total_reward,
     "norm_reward": feat_total_reward_norm,
     "norm_reward_per_slot": scale_by_num_slots(feat_total_reward_norm),
     "reward_per_attestation": scale_by_num_attestations(feat_total_reward),
     "median_density": feat_median_density,
+    "median_density_electra": feat_median_density_electra,
     "mean_density": feat_mean_density,
     "num_single_bit": feat_num_single_bit,
     "percent_single_bit": scale_by_num_attestations(feat_num_single_bit),
diff --git a/prepare_training_data.py b/prepare_training_data.py
index d84871a..8c08b0f 100755
--- a/prepare_training_data.py
+++ b/prepare_training_data.py
@@ -15,7 +15,7 @@
 CLIENTS = ["Grandine", "Lighthouse", "Lodestar", "Nimbus", "Other", "Prysm", "Teku"]
 
 REGEX_PATTERNS = {
-    "Grandine": [],
+    "Grandine": [r".*[Gg]randine.*"],
     "Lighthouse": [r".*[Ll]ighthouse", r"RP-[A-Z]?L v[0-9]*\.[0-9]*\.[0-9]*.*"],
     "Teku": [r".*[Tt]eku", r"RP-[A-Z]?T v[0-9]*\.[0-9]*\.[0-9]*.*"],
     "Nimbus": [r".*[Nn]imbus", r"RP-[A-Z]?N v[0-9]*\.[0-9]*\.[0-9]*.*"],
@@ -62,6 +62,7 @@ def process_file(
     raw_data_dir: str, proc_data_dir: str, disabled_clients: list[str], file_name: str
 ) -> None:
     with open(os.path.join(raw_data_dir, file_name), "r") as f:
+        print(f"Processing {file_name}")
         rewards = json.load(f)
 
     res = classify_rewards_by_graffiti(rewards, disabled_clients=disabled_clients)
diff --git a/requirements.txt b/requirements.txt
index 2f8d11c..1427efc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ sseclient-py==1.8.0
 gunicorn==21.2.0
 matplotlib==3.8.0
 scipy==1.11.3
+plotly==6.1.2