From ba5fae8d43dcea731bacc4f3901aacc5ef813365 Mon Sep 17 00:00:00 2001
From: tandonmitul27
Date: Thu, 5 Feb 2026 00:16:03 +0530
Subject: [PATCH 1/3] Added MLP tests

---
 tests/regression/mlp/Makefile          |  14 +
 tests/regression/mlp/common.h          |  38 +++
 tests/regression/mlp/kernel.cpp        | 162 +++++++++
 tests/regression/mlp/main.cpp          | 346 +++++++++++++++++++
 tests/regression/mlp_vegeta/Makefile   |  14 +
 tests/regression/mlp_vegeta/common.h   |  55 +++
 tests/regression/mlp_vegeta/kernel.cpp | 220 ++++++++++++
 tests/regression/mlp_vegeta/main.cpp   | 451 +++++++++++++++++++++++++
 8 files changed, 1300 insertions(+)
 create mode 100644 tests/regression/mlp/Makefile
 create mode 100644 tests/regression/mlp/common.h
 create mode 100644 tests/regression/mlp/kernel.cpp
 create mode 100644 tests/regression/mlp/main.cpp
 create mode 100644 tests/regression/mlp_vegeta/Makefile
 create mode 100644 tests/regression/mlp_vegeta/common.h
 create mode 100644 tests/regression/mlp_vegeta/kernel.cpp
 create mode 100644 tests/regression/mlp_vegeta/main.cpp

diff --git a/tests/regression/mlp/Makefile b/tests/regression/mlp/Makefile
new file mode 100644
index 000000000..d50062bac
--- /dev/null
+++ b/tests/regression/mlp/Makefile
@@ -0,0 +1,14 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := mlp
+
+SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+VX_SRCS := $(SRC_DIR)/kernel.cpp
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h
new file mode 100644
index 000000000..7cc37012a
--- /dev/null
+++ b/tests/regression/mlp/common.h
@@ -0,0 +1,38 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// MLP Network Configuration (all dimensions multiples of 16)
+#define INPUT_DIM 128
+#define HIDDEN1_DIM 256
+#define HIDDEN2_DIM 128
+#define HIDDEN3_DIM 64
+#define OUTPUT_DIM 16
+
+// Number of layers
+#define NUM_LAYERS 4
+
+// Layer configuration structure
+typedef struct {
+  uint32_t input_dim;
+  uint32_t output_dim;
+  uint64_t weights_addr; // Weight matrix: output_dim x input_dim
+  uint64_t bias_addr;    // Bias vector: output_dim
+} layer_config_t;
+
+// Kernel arguments
+typedef struct {
+  uint32_t num_layers;
+  uint32_t batch_size;         // Number of input samples
+  uint64_t input_addr;         // Input data
+  uint64_t output_addr;        // Final output
+  uint64_t layer_configs_addr; // Pointer to layer configurations
+  // Intermediate buffers for layer outputs
+  uint64_t buffer1_addr;       // For layer 1 output / layer 2 input
+  uint64_t buffer2_addr;       // For layer 2 output / layer 3 input
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp
new file mode 100644
index 000000000..50e4ed24e
--- /dev/null
+++ b/tests/regression/mlp/kernel.cpp
@@ -0,0 +1,162 @@
+#include
+#include
+#include
+#include "common.h"
+
+// This kernel is designed for single-core parallel execution (scales with warps/threads). Multi-core would require inter-core synchronization between layers, which Vortex's vx_spawn_threads doesn't provide yet (to the best of my knowledge)...
+
+// ReLU activation function
+inline TYPE relu(TYPE x) {
+  return (x > 0) ?
x : 0; +} + +// Layer execution arguments passed to spawned threads +typedef struct { + TYPE* input; + TYPE* weights; + TYPE* bias; + TYPE* output; + uint32_t input_dim; + uint32_t output_dim; + bool apply_relu; +} layer_args_t; + + +// Softmax execution arguments +typedef struct { + TYPE* data; + uint32_t size; + TYPE* max_val; + TYPE* sum_exp; +} softmax_args_t; + +// Global shared variables for softmax reductions +TYPE g_softmax_max_val; +TYPE g_softmax_sum_exp; + +// Thread function: thread 0 finds max, others wait +void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { + if (blockIdx.x == 0) { + // Thread 0 does the reduction serially + TYPE max_val = args->data[0]; + for (uint32_t i = 1; i < args->size; ++i) { + if (args->data[i] > max_val) { + max_val = args->data[i]; + } + } + *(args->max_val) = max_val; + } +} + +// Thread function: compute exp in parallel, thread 0 sums +void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + + // All threads compute their exp value + if (i < args->size) { + args->data[i] = expf(args->data[i] - *(args->max_val)); + } + + // Thread 0 does the sum reduction serially + if (i == 0) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; + } + *(args->sum_exp) = sum_exp; + } +} + +// Thread function: normalize in parallel +void softmax_normalize_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + if (i < args->size) { + args->data[i] /= *(args->sum_exp); + } +} + +// Softmax implementation +void apply_softmax(TYPE* data, uint32_t size) { + softmax_args_t args; + args.data = data; + args.size = size; + args.max_val = &g_softmax_max_val; + args.sum_exp = &g_softmax_sum_exp; + + uint32_t single_thread = 1; + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); +} + +// Each thread computes one output neuron +// y[i] = sum_j(W[i][j] * x[j]) + b[i] +void fc_layer_thread(layer_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; // Output neuron index + + if (i >= args->output_dim) return; + + TYPE sum = 0; + + // Compute dot product: W[i] · input + for (uint32_t j = 0; j < args->input_dim; ++j) { + sum += args->weights[i * args->input_dim + j] * args->input[j]; + } + + // Add bias + sum += args->bias[i]; + + // Apply ReLU activation (except for output layer) + if (args->apply_relu) { + sum = relu(sum); + } + + args->output[i] = sum; +} + +// MLP inference kernel +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + + // Layer buffers + auto input = reinterpret_cast(arg->input_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + auto output = reinterpret_cast(arg->output_addr); + + // Pointer arrays for ping-pong buffering + TYPE* layer_inputs[4] = {input, buffer1, buffer2, buffer1}; + TYPE* layer_outputs[4] = {buffer1, buffer2, buffer1, output}; + + layer_args_t layer_args; + + // Execute each layer with parallel threads + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto cfg = &layer_configs[layer]; + + layer_args.input = layer_inputs[layer]; + layer_args.weights = reinterpret_cast(cfg->weights_addr); + layer_args.bias = reinterpret_cast(cfg->bias_addr); + layer_args.output = 
layer_outputs[layer]; + layer_args.input_dim = cfg->input_dim; + layer_args.output_dim = cfg->output_dim; + layer_args.apply_relu = (layer < arg->num_layers - 1); + + // vx_spawn_threads provides barrier synchronization after all threads complete + vx_spawn_threads(1, &layer_args.output_dim, nullptr, + (vx_kernel_func_cb)fc_layer_thread, &layer_args); + } + + // Apply softmax to final output + uint32_t output_size = layer_configs[arg->num_layers - 1].output_dim; + apply_softmax(output, output_size); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + + uint32_t grid_dim = 1; + return vx_spawn_threads(1, &grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp/main.cpp b/tests/regression/mlp/main.cpp new file mode 100644 index 000000000..ca1d7373c --- /dev/null +++ b/tests/regression/mlp/main.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 150 // Higher tolerance for deep network (4 layers, accumulated FP errors) + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +// Device handles +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; + +// Weight and bias buffers for each layer +vx_buffer_h weights_buffers[NUM_LAYERS]; +vx_buffer_h bias_buffers[NUM_LAYERS]; + +kernel_arg_t kernel_arg = {}; + +// Layer dimensions +const uint32_t layer_input_dims[NUM_LAYERS] = {INPUT_DIM, HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}; +const uint32_t layer_output_dims[NUM_LAYERS] = {HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM, OUTPUT_DIM}; + +static void show_usage() { + std::cout << "Vortex MLP Test." << std::endl; +// std::cout << "Usage: [-k: kernel] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "k:h")) != -1) { + switch (c) { + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + vx_mem_free(layer_configs_buffer); + for (int i = 0; i < NUM_LAYERS; ++i) { + vx_mem_free(weights_buffers[i]); + vx_mem_free(bias_buffers[i]); + } + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +// Initialize weights with Xavier initialization +void init_weights(std::vector& weights, uint32_t fan_in, uint32_t fan_out) { + TYPE scale = std::sqrt(2.0f / (fan_in + fan_out)); + for (size_t i = 0; i < weights.size(); ++i) { + // Generate random value between -scale and scale + weights[i] = (static_cast(rand()) / RAND_MAX * 2.0f - 1.0f) * scale; + } +} + +// Initialize biases to zero +void init_bias(std::vector& bias) { + for (size_t i = 0; i < bias.size(); ++i) { + bias[i] = 0.0f; + } +} + +// CPU reference implementation of ReLU +TYPE relu_cpu(TYPE x) { + return (x > 0) ? 
x : 0; +} + +// CPU reference implementation of fully connected layer +void fc_layer_cpu(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, uint32_t input_dim, uint32_t output_dim, bool apply_relu) { + for (uint32_t i = 0; i < output_dim; ++i) { + TYPE sum = 0; + for (uint32_t j = 0; j < input_dim; ++j) { + sum += weights[i * input_dim + j] * input[j]; + } + sum += bias[i]; + output[i] = apply_relu ? relu_cpu(sum) : sum; + } +} + +// CPU reference implementation of softmax +void softmax_cpu(TYPE* data, uint32_t size) { + // Find max for numerical stability + TYPE max_val = data[0]; + for (uint32_t i = 1; i < size; ++i) { + if (data[i] > max_val) { + max_val = data[i]; + } + } + + // Compute exp(x - max) and sum + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < size; ++i) { + data[i] = std::exp(data[i] - max_val); + sum_exp += data[i]; + } + + // Normalize + for (uint32_t i = 0; i < size; ++i) { + data[i] /= sum_exp; + } +} + +// Run MLP inference on CPU for reference +void mlp_cpu(const TYPE* input, TYPE* output, + const std::vector>& weights, + const std::vector>& biases) { + + std::vector layer_output[NUM_LAYERS]; + + for (int l = 0; l < NUM_LAYERS; ++l) { + layer_output[l].resize(layer_output_dims[l]); + + const TYPE* layer_input = (l == 0) ? input : layer_output[l-1].data(); + bool apply_relu = (l < NUM_LAYERS - 1); + + fc_layer_cpu(layer_input, weights[l].data(), biases[l].data(), + layer_output[l].data(), layer_input_dims[l], layer_output_dims[l], apply_relu); + } + + // Copy final output + memcpy(output, layer_output[NUM_LAYERS-1].data(), OUTPUT_DIM * sizeof(TYPE)); + + // Apply softmax to final output + softmax_cpu(output, OUTPUT_DIM); +} + +// Compare floating point values with ULP tolerance +bool compare_float(TYPE a, TYPE b, int index, int& errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f (diff=%d ULP)\n", index, b, a, d); + } + return false; + } + return true; +} + +int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(42); // Fixed seed for reproducible results across config changes + + std::cout << "=== Vortex MLP Neural Network Test ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM + << " -> " << HIDDEN2_DIM << " -> " << HIDDEN3_DIM + << " -> " << OUTPUT_DIM << std::endl; + + // Open device connection + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Allocate host buffers for weights and biases + std::vector> h_weights(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + h_weights[l].resize(out_dim * in_dim); + h_biases[l].resize(out_dim); + + init_weights(h_weights[l], in_dim, out_dim); + init_bias(h_biases[l]); + } + + // Allocate host input/output buffers + std::vector h_input(INPUT_DIM); + std::vector h_output(OUTPUT_DIM); + std::vector h_ref_output(OUTPUT_DIM); + + // Initialize input with random values + for (uint32_t i = 0; i < INPUT_DIM; ++i) { + h_input[i] = static_cast(rand()) / RAND_MAX; + } + + // Print input summary + // std::cout << "Input (first 8 values): "; + // for (int i = 0; i < 8; ++i) { + // std::cout << h_input[i] << " "; + // } + // std::cout << "..." 
<< std::endl; + + // Allocate device memory for input/output + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + + RT_CHECK(vx_mem_alloc(device, OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + + // Intermediate buffers (sized for largest hidden layer) + uint32_t max_hidden = std::max(HIDDEN1_DIM, std::max(HIDDEN2_DIM, HIDDEN3_DIM)); + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + // Allocate and upload weights and biases + std::vector layer_configs(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + // Allocate weights + RT_CHECK(vx_mem_alloc(device, out_dim * in_dim * sizeof(TYPE), VX_MEM_READ, &weights_buffers[l])); + RT_CHECK(vx_mem_address(weights_buffers[l], &layer_configs[l].weights_addr)); + RT_CHECK(vx_copy_to_dev(weights_buffers[l], h_weights[l].data(), 0, out_dim * in_dim * sizeof(TYPE))); + + // Allocate biases + RT_CHECK(vx_mem_alloc(device, out_dim * sizeof(TYPE), VX_MEM_READ, &bias_buffers[l])); + RT_CHECK(vx_mem_address(bias_buffers[l], &layer_configs[l].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[l], h_biases[l].data(), 0, out_dim * sizeof(TYPE))); + + layer_configs[l].input_dim = in_dim; + layer_configs[l].output_dim = out_dim; + + std::cout << " Layer " << l << ": " << in_dim << " x " << out_dim + << " (weights: " << (out_dim * in_dim * sizeof(TYPE)) << " bytes)" << std::endl; + } + + // Upload layer configs + RT_CHECK(vx_mem_alloc(device, NUM_LAYERS * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, NUM_LAYERS * sizeof(layer_config_t))); + + // Upload input + std::cout << "Uploading input data..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, INPUT_DIM * sizeof(TYPE))); + + // Set kernel arguments + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.batch_size = 1; + + // Upload kernel binary + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + // Upload kernel arguments + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + // Start device + std::cout << "Starting MLP inference on device..." << std::endl; + auto time_start = std::chrono::high_resolution_clock::now(); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + + // Wait for completion + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + std::cout << "Elapsed time: " << elapsed << " ms" << std::endl; + + // Download output + std::cout << "Downloading output..." 
<< std::endl; + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, OUTPUT_DIM * sizeof(TYPE))); + + // Compute CPU reference + std::cout << "Computing CPU reference..." << std::endl; + mlp_cpu(h_input.data(), h_ref_output.data(), h_weights, h_biases); + + // Print outputs + // std::cout << "Device output: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_output[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << "CPU reference: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_ref_output[i] << " "; + // } + // std::cout << std::endl; + + // Verify results + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (uint32_t i = 0; i < OUTPUT_DIM; ++i) { + if (!compare_float(h_output[i], h_ref_output[i], i, errors)) { + ++errors; + } + } + + // Cleanup + std::cout << "Cleaning up..." << std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} diff --git a/tests/regression/mlp_vegeta/Makefile b/tests/regression/mlp_vegeta/Makefile new file mode 100644 index 000000000..b2b36e7cd --- /dev/null +++ b/tests/regression/mlp_vegeta/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mlp_vegeta + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= + +include ../common.mk diff --git a/tests/regression/mlp_vegeta/common.h b/tests/regression/mlp_vegeta/common.h new file mode 100644 index 000000000..ebbf9c6d3 --- /dev/null +++ b/tests/regression/mlp_vegeta/common.h @@ -0,0 +1,55 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#ifndef TYPE +#define TYPE float +#endif + +// MLP Network Configuration (same as mlp_test, all multiples of 16) +#define INPUT_DIM 128 +#define HIDDEN1_DIM 256 +#define HIDDEN2_DIM 128 +#define HIDDEN3_DIM 64 +#define OUTPUT_DIM 16 + +// Number of layers +#define NUM_LAYERS 4 + +// VEGETA tile dimensions +#define TILE_SIZE 16 +#define BATCH_SIZE 16 // VEGETA requires batch=16 for proper 16x16 tiles +#define T_TILE_BYTES (TILE_SIZE * TILE_SIZE * sizeof(TYPE)) // 1KB +#define U_TILE_BYTES (2 * T_TILE_BYTES) // 2KB +#define V_TILE_BYTES (4 * T_TILE_BYTES) // 4KB +#define M_TILE_BYTES (TILE_SIZE * TILE_SIZE / 2) // 128 bytes metadata + +// GEMM modes for VEGETA +typedef enum { + MLP_MODE_TGEMM = 0, // Dense × Dense (T × T) + MLP_MODE_UGEMM = 1, // Dense × 2:4 Sparse (T × U) + MLP_MODE_VGEMM = 2 // Dense × 1:4 Sparse (T × V) +} mlp_mode_t; + +// Layer configuration structure +typedef struct { + uint32_t input_dim; + uint32_t output_dim; + uint64_t weights_addr; // Weight matrix: input_dim x output_dim (row-major) + uint64_t bias_addr; // Bias vector: output_dim + uint64_t metadata_addr; // Sparsity metadata (for UGEMM/VGEMM modes) +} layer_config_t; + +// Kernel arguments +typedef struct { + uint32_t num_layers; + uint32_t mode; // MLP_MODE_TGEMM, MLP_MODE_UGEMM, or MLP_MODE_VGEMM + uint64_t input_addr; // Input data: [BATCH_SIZE x INPUT_DIM] + uint64_t output_addr; // Final output: [BATCH_SIZE x OUTPUT_DIM] + uint64_t layer_configs_addr; // Pointer to layer configurations + uint64_t buffer1_addr; // Intermediate buffer 1 + uint64_t buffer2_addr; // Intermediate buffer 2 +} kernel_arg_t; + +#endif diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp new file 
mode 100644 index 000000000..ad57212f0 --- /dev/null +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include "common.h" + +// Aligned tile buffer for intermediate results +static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); + +// ReLU activation (in-place) for batched data +inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + for (uint32_t i = 0; i < dim; ++i) { + TYPE& v = data[b * dim + i]; + if (v < 0) v = 0; + } + } +} + +// Softmax for each sample in batch +void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + TYPE* sample = data + b * dim; + + TYPE max_val = sample[0]; + for (uint32_t i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + + for (uint32_t i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } + } +} + +// Initialize C tile buffer to zeros +inline void clear_C_tile() { + for (uint32_t i = 0; i < TILE_SIZE * TILE_SIZE; ++i) { + g_C_tile[i] = 0.0f; + } +} + +// ============================================================================= +// TGEMM: Dense × Dense (T × T → T) +// ============================================================================= +void vegeta_tgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lt(2, (size_t)B_tile, 0); // Load B to T2 + vx_tgemm(0, 1, 2); // T0 += T1 × T2 + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_tgemm_batch( + TYPE* input, TYPE* weights, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; + vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// UGEMM: Dense × 2:4 Sparse (T × U → T) +// For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// ============================================================================= +void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 + vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_ugemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + 
clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: half the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// VGEMM: Dense × 1:4 Sparse (T × V → T) +// For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// ============================================================================= +void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 + vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_vgemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: quarter the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// Kernel Entry Point +// ============================================================================= +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto input = reinterpret_cast(arg->input_addr); + auto output = reinterpret_cast(arg->output_addr); + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + uint32_t mode = arg->mode; + + TYPE* layer_input = input; + TYPE* layer_output = buffer1; + + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto& config = layer_configs[layer]; + auto weights = reinterpret_cast(config.weights_addr); + auto bias = reinterpret_cast(config.bias_addr); + auto metadata = reinterpret_cast(config.metadata_addr); + + if (layer == arg->num_layers - 1) { + layer_output = output; + } else if (layer % 2 == 0) { + layer_output = buffer1; + } else { + layer_output = buffer2; + } + + // Execute layer based on mode + if (mode == MLP_MODE_TGEMM) { + vegeta_layer_tgemm_batch(layer_input, weights, bias, layer_output, + config.input_dim, config.output_dim); + } 
else if (mode == MLP_MODE_UGEMM) { + vegeta_layer_ugemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } else if (mode == MLP_MODE_VGEMM) { + vegeta_layer_vgemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } + + // Apply ReLU for hidden layers + if (layer < arg->num_layers - 1) { + apply_relu_batch(layer_output, BATCH_SIZE, config.output_dim); + } + + layer_input = layer_output; + } + + apply_softmax_batch(output, BATCH_SIZE, OUTPUT_DIM); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, nullptr, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp new file mode 100644 index 000000000..a801b854c --- /dev/null +++ b/tests/regression/mlp_vegeta/main.cpp @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +std::vector weight_buffers; +std::vector bias_buffers; +std::vector metadata_buffers; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; + +static mlp_mode_t mlp_mode = MLP_MODE_TGEMM; + +static void show_usage() { + std::cout << "Vortex VEGETA MLP Test (Batched)." 
<< std::endl; +// std::cout << "Usage: [-m mode] [-h: help]" << std::endl; +// std::cout << " -m mode: GEMM mode (0=TGEMM, 1=UGEMM, 2=VGEMM) [default: 0]" << std::endl; +// std::cout << " TGEMM (0): Dense × Dense" << std::endl; +// std::cout << " UGEMM (1): Dense × 2:4 Sparse" << std::endl; +// std::cout << " VGEMM (2): Dense × 1:4 Sparse" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "m:h")) != -1) { + switch (c) { + case 'm': + mlp_mode = static_cast(atoi(optarg)); + if (mlp_mode < MLP_MODE_TGEMM || mlp_mode > MLP_MODE_VGEMM) { + std::cerr << "Error: Invalid mode " << mlp_mode << std::endl; + show_usage(); + exit(-1); + } + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(layer_configs_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + for (auto& buf : weight_buffers) vx_mem_free(buf); + for (auto& buf : bias_buffers) vx_mem_free(buf); + for (auto& buf : metadata_buffers) vx_mem_free(buf); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Sparse Weight Compression +/////////////////////////////////////////////////////////////////////////////// + +// Compress weights to 2:4 sparsity (keep 2 largest of every 4) +// Input: dense weights [in_dim x out_dim] +// Output: compressed weights [in_dim/2 x out_dim], metadata +void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + // For the CPU reference, we need the "logical" dense weights with zeros + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 2) * out_dim); + + // Metadata: 128 bytes per 16x16 tile + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + // For each group of 4 consecutive values in K dimension, keep 2 largest + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Get 4 values + std::pair vals[4]; + for (int k = 0; k < 4; ++k) { + vals[k] = {std::abs(dense[(i + k) * out_dim + j]), k}; + } + // Sort by magnitude (descending) + std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); + + // Keep top 2, generate mask + uint8_t mask = 0; + for (int k = 0; k < 2; ++k) { + int pos = vals[k].second; + mask |= (1 << pos); + compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + + // Store metadata (simplified - in real impl would be tile-based) + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +// Compress weights to 1:4 sparsity (keep 1 largest of every 4) +void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 4) * 
out_dim); + + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Find largest of 4 + int max_pos = 0; + float max_val = std::abs(dense[i * out_dim + j]); + for (int k = 1; k < 4; ++k) { + float val = std::abs(dense[(i + k) * out_dim + j]); + if (val > max_val) { + max_val = val; + max_pos = k; + } + } + + compressed[comp_idx * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + comp_idx++; + + // Store metadata + uint8_t mask = (1 << max_pos); + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// CPU Reference +/////////////////////////////////////////////////////////////////////////////// + +static inline TYPE relu_cpu(TYPE x) { + return (x > 0) ? x : 0; +} + +static void softmax_cpu(TYPE* sample, int dim) { + TYPE max_val = sample[0]; + for (int i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + TYPE sum_exp = 0.0f; + for (int i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + for (int i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } +} + +static void layer_cpu_batch(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, int batch_size, int input_dim, int output_dim) { + for (int b = 0; b < batch_size; ++b) { + for (int j = 0; j < output_dim; ++j) { + TYPE sum = bias[j]; + for (int i = 0; i < input_dim; ++i) { + sum += input[b * input_dim + i] * weights[i * output_dim + j]; + } + output[b * output_dim + j] = sum; + } + } +} + +static void mlp_cpu_batch(const std::vector& input, + const std::vector>& weights, + const std::vector>& biases, + const std::vector>& dims, + std::vector& output) { + std::vector layer_input = input; + std::vector layer_output; + + for (size_t layer = 0; layer < dims.size(); ++layer) { + int in_dim = dims[layer].first; + int out_dim = dims[layer].second; + layer_output.resize(BATCH_SIZE * out_dim); + + layer_cpu_batch(layer_input.data(), weights[layer].data(), + biases[layer].data(), layer_output.data(), + BATCH_SIZE, in_dim, out_dim); + + if (layer < dims.size() - 1) { + for (auto& v : layer_output) { + v = relu_cpu(v); + } + } + + layer_input = layer_output; + } + + output = layer_output; + for (int b = 0; b < BATCH_SIZE; ++b) { + softmax_cpu(&output[b * OUTPUT_DIM], OUTPUT_DIM); + } +} + +// Compare floats with relative tolerance +static bool compare_float(TYPE a, TYPE b, int batch, int idx, int& errors) { + constexpr float REL_TOL = 0.05f; // 5% relative tolerance + constexpr float ABS_TOL = 1e-6f; + + float diff = std::abs(a - b); + float max_val = std::max(std::abs(a), std::abs(b)); + + if (diff > REL_TOL * max_val + ABS_TOL) { + if (errors < 10) { + printf("*** error: [batch=%d, idx=%d] expected=%.6f, actual=%.6f (diff=%.2f%%)\n", + batch, idx, b, a, 100.0f * diff / max_val); + } + ++errors; + return false; + } + return true; +} + +/////////////////////////////////////////////////////////////////////////////// + 
+int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(50); + + std::vector> layer_dims = { + {INPUT_DIM, HIDDEN1_DIM}, + {HIDDEN1_DIM, HIDDEN2_DIM}, + {HIDDEN2_DIM, HIDDEN3_DIM}, + {HIDDEN3_DIM, OUTPUT_DIM} + }; + + const char* mode_name; + switch (mlp_mode) { + case MLP_MODE_TGEMM: mode_name = "TGEMM (Dense × Dense)"; break; + case MLP_MODE_UGEMM: mode_name = "UGEMM (Dense × 2:4 Sparse)"; break; + case MLP_MODE_VGEMM: mode_name = "VGEMM (Dense × 1:4 Sparse)"; break; + default: mode_name = "Unknown"; + } + + std::cout << "=== Vortex VEGETA MLP Neural Network Test (Batched) ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM << " -> " + << HIDDEN2_DIM << " -> " << HIDDEN3_DIM << " -> " << OUTPUT_DIM << std::endl; + std::cout << "Mode: " << mode_name << std::endl; + std::cout << "Batch size: " << BATCH_SIZE << std::endl; + + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Generate batched input + std::vector h_input(BATCH_SIZE * INPUT_DIM); + for (auto& v : h_input) { + v = static_cast(rand()) / RAND_MAX; + } + + // Generate weights, biases, and optional sparse compression + std::vector> h_weights_dense(NUM_LAYERS); + std::vector> h_weights_compressed(NUM_LAYERS); + std::vector> h_weights_logical(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + std::vector> h_metadata(NUM_LAYERS); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + // Generate dense weights + h_weights_dense[layer].resize(in_dim * out_dim); + for (auto& v : h_weights_dense[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + h_biases[layer].resize(out_dim); + for (auto& v : h_biases[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + // Compress weights for sparse modes + if (mlp_mode == MLP_MODE_UGEMM) { + compress_2_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else if (mlp_mode == MLP_MODE_VGEMM) { + compress_1_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else { + h_weights_logical[layer] = h_weights_dense[layer]; + } + } + + // Allocate device memory + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + + int max_hidden = std::max({HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + + std::vector layer_configs(NUM_LAYERS); + weight_buffers.resize(NUM_LAYERS); + bias_buffers.resize(NUM_LAYERS); + metadata_buffers.resize(NUM_LAYERS, nullptr); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + layer_configs[layer].input_dim = in_dim; + layer_configs[layer].output_dim = out_dim; + + // Upload weights (compressed for sparse modes, dense for TGEMM) + std::vector& weights_to_upload = + (mlp_mode == MLP_MODE_TGEMM) ? 
h_weights_dense[layer] : h_weights_compressed[layer]; + + size_t weight_size = weights_to_upload.size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, weight_size, VX_MEM_READ, &weight_buffers[layer])); + RT_CHECK(vx_mem_address(weight_buffers[layer], &layer_configs[layer].weights_addr)); + RT_CHECK(vx_copy_to_dev(weight_buffers[layer], weights_to_upload.data(), 0, weight_size)); + + // Bias + size_t bias_size = h_biases[layer].size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, bias_size, VX_MEM_READ, &bias_buffers[layer])); + RT_CHECK(vx_mem_address(bias_buffers[layer], &layer_configs[layer].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[layer], h_biases[layer].data(), 0, bias_size)); + + // Metadata for sparse modes + if (mlp_mode != MLP_MODE_TGEMM && !h_metadata[layer].empty()) { + size_t meta_size = h_metadata[layer].size(); + RT_CHECK(vx_mem_alloc(device, meta_size, VX_MEM_READ, &metadata_buffers[layer])); + RT_CHECK(vx_mem_address(metadata_buffers[layer], &layer_configs[layer].metadata_addr)); + RT_CHECK(vx_copy_to_dev(metadata_buffers[layer], h_metadata[layer].data(), 0, meta_size)); + } else { + layer_configs[layer].metadata_addr = 0; + } + } + + RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t))); + + std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE))); + + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + kernel_arg_t kernel_arg = {}; + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.mode = mlp_mode; + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + std::cout << "Starting VEGETA MLP inference on device..." << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl; + + std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl; + std::vector h_output(BATCH_SIZE * OUTPUT_DIM); + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE))); + + // CPU reference uses logical weights (with zeros for pruned elements) + std::cout << "Computing CPU reference..." << std::endl; + std::vector h_ref; + mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref); + + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (int b = 0; b < BATCH_SIZE; ++b) { + for (int i = 0; i < OUTPUT_DIM; ++i) { + compare_float(h_output[b * OUTPUT_DIM + i], h_ref[b * OUTPUT_DIM + i], b, i, errors); + } + } + + std::cout << "Cleaning up..." 
<< std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} From 3f9e97b2e557058313c8503e9407ed46d6df60d4 Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Thu, 5 Feb 2026 21:43:32 +0530 Subject: [PATCH 2/3] MLP test corrections --- tests/regression/mlp/common.h | 2 + tests/regression/mlp/kernel.cpp | 24 ++--- tests/regression/mlp_vegeta/kernel.cpp | 93 ++++++++++++-------- tests/regression/mlp_vegeta/main.cpp | 117 +++++++++++++++++++------ 4 files changed, 164 insertions(+), 72 deletions(-) diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h index 7cc37012a..476f234b0 100644 --- a/tests/regression/mlp/common.h +++ b/tests/regression/mlp/common.h @@ -1,6 +1,8 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp index 50e4ed24e..92ead2ec1 100644 --- a/tests/regression/mlp/kernel.cpp +++ b/tests/regression/mlp/kernel.cpp @@ -48,23 +48,23 @@ void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { } } -// Thread function: compute exp in parallel, thread 0 sums -void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { +// Thread function: compute exp in parallel +void softmax_exp_thread(softmax_args_t* __UNIFORM__ args) { uint32_t i = blockIdx.x; // All threads compute their exp value if (i < args->size) { args->data[i] = expf(args->data[i] - *(args->max_val)); } - - // Thread 0 does the sum reduction serially - if (i == 0) { - TYPE sum_exp = 0.0f; - for (uint32_t j = 0; j < args->size; ++j) { - sum_exp += args->data[j]; - } - *(args->sum_exp) = sum_exp; +} + +// Thread function: single thread sums all exp values +void softmax_sum_thread(softmax_args_t* __UNIFORM__ args) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; } + *(args->sum_exp) = sum_exp; } // Thread function: normalize in parallel @@ -86,7 +86,9 @@ void apply_softmax(TYPE* data, uint32_t size) { uint32_t single_thread = 1; vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); - vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_thread, &args); + + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_sum_thread, &args); vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); } diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp index ad57212f0..ea516c616 100644 --- a/tests/regression/mlp_vegeta/kernel.cpp +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -6,21 +6,32 @@ // Aligned tile buffer for intermediate results static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); -// ReLU activation (in-place) for batched data +// ReLU activation (in-place) for tile-major batched data inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { - for (uint32_t b = 0; b < batch_size; ++b) { - for (uint32_t i = 0; i < dim; ++i) { - TYPE& v = data[b * dim + i]; - if (v < 0) v = 0; - } + // Data is in tile-major format: process all elements + uint32_t total_elements = batch_size * dim; + for (uint32_t i = 0; i < total_elements; ++i) { + if (data[i] < 0) data[i] = 0; } } -// Softmax for each sample in batch +// 
Softmax for each sample in batch (tile-major layout) void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + // Convert tile-major to row-major for softmax processing + // Since dim is OUTPUT_DIM=16 which is one tile, data layout is simple + int num_tile_cols = dim / TILE_SIZE; + for (uint32_t b = 0; b < batch_size; ++b) { - TYPE* sample = data + b * dim; + // Extract this sample's values across tiles + TYPE sample[OUTPUT_DIM]; + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + sample[tile_col * TILE_SIZE + col] = data[tile_idx]; + } + } + // Compute softmax TYPE max_val = sample[0]; for (uint32_t i = 1; i < dim; ++i) { if (sample[i] > max_val) max_val = sample[i]; @@ -35,6 +46,14 @@ void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { for (uint32_t i = 0; i < dim; ++i) { sample[i] /= sum_exp; } + + // Write back to tile-major + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + data[tile_idx] = sample[tile_col * TILE_SIZE + col]; + } + } } } @@ -67,30 +86,31 @@ void vegeta_layer_tgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major layout: each tile column stores BATCH_SIZE×TILE_SIZE contiguously + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// UGEMM: Dense × 2:4 Sparse (T × U → T) +// UGEMM: Dense × 2:4 Sparse Weights (T × U → T) // For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// T-reg contains sparse weights (with metadata), U-reg contains dense activations // ============================================================================= void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 - vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for weights into M1 + vx_lu(2, (size_t)A_tile, 0); // Load dense activations A to U2 + vx_ugemm(0, 1, 2); // T0 += T1(sparse weights) × U2(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -105,7 +125,8 @@ void vegeta_layer_ugemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: U-reg loads 2×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); 
// Compressed weights: half the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -113,25 +134,25 @@ void vegeta_layer_ugemm_batch( vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// VGEMM: Dense × 1:4 Sparse (T × V → T) +// VGEMM: Dense × 1:4 Sparse Weights (T × V → T) // For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// T-reg contains sparse weights (with metadata), V-reg contains dense activations // ============================================================================= void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 - vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load compressed sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for sparse weights to M1 + vx_lv(1, (size_t)A_tile, 0); // Load dense activations A to V1 + vx_vgemm(0, 1, 1); // T0 += T1(sparse weights) × V1(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -146,7 +167,8 @@ void vegeta_layer_vgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: V-reg loads 4×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); // Compressed weights: quarter the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -154,11 +176,10 @@ void vegeta_layer_vgemm_batch( vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp index a801b854c..ac9e4cf3b 100644 --- a/tests/regression/mlp_vegeta/main.cpp +++ b/tests/regression/mlp_vegeta/main.cpp @@ -84,6 +84,47 @@ void cleanup() { } } +/////////////////////////////////////////////////////////////////////////////// +// Tile-Major Layout Conversion +/////////////////////////////////////////////////////////////////////////////// + +// Convert row-major [batch × dim] to tile-major layout for VEGETA loads +// Input: row_major[batch × dim] where batch=16, dim is multiple of 16 +// Output: tile_major stores data as contiguous 16×16 tiles +// Memory layout: For each tile column, 
store all 16 rows contiguously +void convert_to_tile_major(const std::vector& row_major, int batch, int dim, + std::vector& tile_major) { + int num_tile_cols = dim / TILE_SIZE; + tile_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = row * dim + tile_col * TILE_SIZE + col; + int dst_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + tile_major[dst_idx] = row_major[src_idx]; + } + } + } +} + +// Convert tile-major back to row-major [batch × dim] +void convert_from_tile_major(const std::vector& tile_major, int batch, int dim, + std::vector& row_major) { + int num_tile_cols = dim / TILE_SIZE; + row_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + int dst_idx = row * dim + tile_col * TILE_SIZE + col; + row_major[dst_idx] = tile_major[src_idx]; + } + } + } +} + /////////////////////////////////////////////////////////////////////////////// // Sparse Weight Compression /////////////////////////////////////////////////////////////////////////////// @@ -115,25 +156,38 @@ void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim // Sort by magnitude (descending) std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); - // Keep top 2, generate mask + // Create bitmask for top 2 values uint8_t mask = 0; for (int k = 0; k < 2; ++k) { int pos = vals[k].second; mask |= (1 << pos); - compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; - logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; - comp_idx++; } - // Store metadata (simplified - in real impl would be tile-based) + // Store compressed values in POSITION ORDER (not magnitude order) + // Hardware iterates through bit positions 0-3 and expects values in that order + for (int pos = 0; pos < 4; ++pos) { + if (mask & (1 << pos)) { + compressed[comp_idx * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + } + + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles + // Byte layout per row: byte 0 = cols 0,1; byte 1 = cols 2,3; ...; byte 7 = cols 14,15 int tile_in = i / TILE_SIZE; int tile_out = j / TILE_SIZE; int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; - int row_in_tile = (i % TILE_SIZE) / 4; - int col_in_tile = j % TILE_SIZE; - int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; - if (byte_offset < (int)metadata.size()) { - metadata[byte_offset] = mask; + int row_in_tile = i % TILE_SIZE; + int k_grp = (i % TILE_SIZE) / 4; // Which group of 4 in this row + int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2; + if (byte_idx < (int)metadata.size()) { + if (k_grp % 2 == 0) { + metadata[byte_idx] = (mask << 4); // Upper nibble + } else { + metadata[byte_idx] |= mask; // Lower nibble + } } } } @@ -168,16 +222,21 @@ void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; comp_idx++; - // Store metadata + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles uint8_t mask = (1 << 
 
 ///////////////////////////////////////////////////////////////////////////////
 // Sparse Weight Compression
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -115,25 +156,38 @@ void compress_2_4_sparse(const std::vector<TYPE>& dense, int in_dim, int out_dim
       // Sort by magnitude (descending)
       std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; });
 
-      // Keep top 2, generate mask
+      // Create bitmask for top 2 values
       uint8_t mask = 0;
       for (int k = 0; k < 2; ++k) {
         int pos = vals[k].second;
         mask |= (1 << pos);
-        compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j];
-        logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j];
-        comp_idx++;
       }
 
-      // Store metadata (simplified - in real impl would be tile-based)
+      // Store compressed values in POSITION ORDER (not magnitude order)
+      // Hardware iterates through bit positions 0-3 and expects values in that order
+      for (int pos = 0; pos < 4; ++pos) {
+        if (mask & (1 << pos)) {
+          compressed[comp_idx * out_dim + j] = dense[(i + pos) * out_dim + j];
+          logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j];
+          comp_idx++;
+        }
+      }
+
+      // Store metadata in 16×16 nibble format (128 bytes per tile)
+      // Each row has 8 bytes, each byte has 2 nibbles
+      // Byte layout per row: byte 0 = cols 0,1; byte 1 = cols 2,3; ...; byte 7 = cols 14,15
       int tile_in = i / TILE_SIZE;
       int tile_out = j / TILE_SIZE;
       int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES;
-      int row_in_tile = (i % TILE_SIZE) / 4;
-      int col_in_tile = j % TILE_SIZE;
-      int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile;
-      if (byte_offset < (int)metadata.size()) {
-        metadata[byte_offset] = mask;
+      int row_in_tile = i % TILE_SIZE;
+      int k_grp = (i % TILE_SIZE) / 4;  // Which group of 4 in this row
+      int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2;
+      if (byte_idx < (int)metadata.size()) {
+        if (k_grp % 2 == 0) {
+          metadata[byte_idx] = (mask << 4);  // Upper nibble
+        } else {
+          metadata[byte_idx] |= mask;        // Lower nibble
+        }
       }
     }
   }
@@ -168,16 +222,21 @@ void compress_1_4_sparse(const std::vector<TYPE>& dense, int in_dim, int out_dim
       logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j];
       comp_idx++;
 
-      // Store metadata
+      // Store metadata in 16×16 nibble format (128 bytes per tile)
+      // Each row has 8 bytes, each byte has 2 nibbles
       uint8_t mask = (1 << max_pos);
       int tile_in = i / TILE_SIZE;
       int tile_out = j / TILE_SIZE;
       int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES;
-      int row_in_tile = (i % TILE_SIZE) / 4;
-      int col_in_tile = j % TILE_SIZE;
-      int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile;
-      if (byte_offset < (int)metadata.size()) {
-        metadata[byte_offset] = mask;
+      int row_in_tile = i % TILE_SIZE;
+      int k_grp = (i % TILE_SIZE) / 4;  // Which group of 4 in this row
+      int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2;
+      if (byte_idx < (int)metadata.size()) {
+        if (k_grp % 2 == 0) {
+          metadata[byte_idx] = (mask << 4);  // Upper nibble
+        } else {
+          metadata[byte_idx] |= mask;        // Lower nibble
+        }
       }
     }
   }
@@ -301,12 +360,16 @@ int main(int argc, char *argv[]) {
   std::cout << "Opening device connection..." << std::endl;
   RT_CHECK(vx_dev_open(&device));
 
-  // Generate batched input
-  std::vector<TYPE> h_input(BATCH_SIZE * INPUT_DIM);
-  for (auto& v : h_input) {
+  // Generate batched input (row-major)
+  std::vector<TYPE> h_input_row_major(BATCH_SIZE * INPUT_DIM);
+  for (auto& v : h_input_row_major) {
     v = static_cast<TYPE>(rand()) / RAND_MAX;
   }
 
+  // Convert to tile-major for device
+  std::vector<TYPE> h_input_tile_major;
+  convert_to_tile_major(h_input_row_major, BATCH_SIZE, INPUT_DIM, h_input_tile_major);
+
   // Generate weights, biases, and optional sparse compression
   std::vector<std::vector<TYPE>> h_weights_dense(NUM_LAYERS);
   std::vector<std::vector<TYPE>> h_weights_compressed(NUM_LAYERS);
@@ -394,8 +457,8 @@ int main(int argc, char *argv[]) {
   RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer));
   RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t)));
 
-  std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl;
-  RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE)));
+  std::cout << "Uploading input data (" << BATCH_SIZE << " samples, tile-major)..." << std::endl;
+  RT_CHECK(vx_copy_to_dev(input_buffer, h_input_tile_major.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE)));
 
   std::cout << "Uploading kernel binary..." << std::endl;
   RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
@@ -420,14 +483,18 @@ int main(int argc, char *argv[]) {
   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
   std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl;
 
-  std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl;
-  std::vector<TYPE> h_output(BATCH_SIZE * OUTPUT_DIM);
-  RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE)));
+  std::cout << "Downloading output (" << BATCH_SIZE << " samples, tile-major)..." << std::endl;
+  std::vector<TYPE> h_output_tile_major(BATCH_SIZE * OUTPUT_DIM);
+  RT_CHECK(vx_copy_from_dev(h_output_tile_major.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE)));
+
+  // Convert output back to row-major for comparison
+  std::vector<TYPE> h_output;
+  convert_from_tile_major(h_output_tile_major, BATCH_SIZE, OUTPUT_DIM, h_output);
 
   // CPU reference uses logical weights (with zeros for pruned elements)
   std::cout << "Computing CPU reference..." << std::endl;
   std::vector<TYPE> h_ref;
-  mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref);
+  mlp_cpu_batch(h_input_row_major, h_weights_logical, h_biases, layer_dims, h_ref);
 
   std::cout << "Verifying results..."
<< std::endl; int errors = 0; From abd48054063a341eb5424525aea69344d916e4ca Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Sun, 8 Feb 2026 20:04:33 +0530 Subject: [PATCH 3/3] Added a separate LSU for Vegeta Engine --- sim/simx/Makefile | 7 +- sim/simx/core.cpp | 1 + sim/simx/core.h | 6 ++ sim/simx/execute.cpp | 5 -- sim/simx/sparse_unit.cpp | 145 ++++++++++++++++++++++----------------- sim/simx/vegeta_lsu.cpp | 134 ++++++++++++++++++++++++++++++++++++ sim/simx/vegeta_lsu.h | 102 +++++++++++++++++++++++++++ 7 files changed, 330 insertions(+), 70 deletions(-) create mode 100644 sim/simx/vegeta_lsu.cpp create mode 100644 sim/simx/vegeta_lsu.h diff --git a/sim/simx/Makefile b/sim/simx/Makefile index bbb800612..43267ed74 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -27,8 +27,6 @@ SRCS += $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp SRCS += $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp SRCS += $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp -# sparse unit; add -DEXT_SPARSE_ENABLE flag later -SRCS += $(SRC_DIR)/sparse_unit.cpp # Add V extension sources ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),) @@ -42,6 +40,11 @@ endif ifneq ($(findstring -DEXT_TCU_ENABLE, $(CONFIGS)),) SRCS += $(SRC_DIR)/tensor_unit.cpp endif +# Add VEGETA extension sources +ifneq ($(findstring -DEXT_VEGETA_ENABLE, $(CONFIGS)),) + SRCS += $(SRC_DIR)/vegeta_lsu.cpp + SRCS += $(SRC_DIR)/sparse_unit.cpp +endif # Debugging ifdef DEBUG diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 49617318e..ba3644efd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -64,6 +64,7 @@ Core::Core(const SimContext& ctx, #endif #ifdef EXT_VEGETA_ENABLE , sparse_unit_(SparseUnit::Create("spu", arch, this)) + , vegeta_lsu_(VegetaLsu::Create("vegeta_lsu", this, 1)) #endif , emulator_(arch, dcrs, this) , ibuffers_(arch.num_warps(), IBUF_SIZE) diff --git a/sim/simx/core.h b/sim/simx/core.h index dcc675a51..30d96296a 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -31,6 +31,7 @@ #endif #ifdef EXT_VEGETA_ENABLE #include "sparse_unit.h" +#include "vegeta_lsu.h" #endif #include "dispatcher.h" @@ -184,6 +185,10 @@ class Core : public SimObject { SparseUnit::Ptr& sparse_unit() { return sparse_unit_; } + + VegetaLsu::Ptr& vegeta_lsu() { + return vegeta_lsu_; + } #endif auto& trace_pool() { @@ -217,6 +222,7 @@ class Core : public SimObject { #ifdef EXT_VEGETA_ENABLE SparseUnit::Ptr sparse_unit_; + VegetaLsu::Ptr vegeta_lsu_; #endif Emulator emulator_; diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index 80c3132f9..7cb99d1fc 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1556,7 +1556,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_T: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1570,7 +1569,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_U: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1585,7 +1583,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_V: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register 
indices from instruction uint32_t dst_reg = rdest.idx; @@ -1599,7 +1596,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_R: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1614,7 +1610,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { auto tpuArgs = std::get(instrArgs); auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Get metadata from integer registers a0-a7 (x10-x17) for sparse fragA // These contain metadata values loaded by mma_sync into a0-a7 diff --git a/sim/simx/sparse_unit.cpp b/sim/simx/sparse_unit.cpp index 275146b23..2740c391e 100644 --- a/sim/simx/sparse_unit.cpp +++ b/sim/simx/sparse_unit.cpp @@ -16,6 +16,7 @@ #include "sparse_cfg.h" #include #include "core.h" +#include "vegeta_lsu.h" #include using namespace vortex; @@ -772,90 +773,97 @@ class SparseUnit::Impl { uint32_t tile_reg_idx = vd; assert(tile_reg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[tile_reg_idx]; - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); // 4 bytes for fp32 base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes + // Use VegetaLsu for bulk tile load (1KB) + constexpr uint32_t T_TILE_SIZE = TILE_DIM * TILE_DIM * sizeof(float); + float tile_buffer[TILE_DIM * TILE_DIM]; + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::T_TILE, + tile_reg_idx, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile register for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = base_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - // Interpret as float and store in tile register - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[row * TILE_DIM + col]; } } - DP(2, "TILE_LOAD_T: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < TILE_DIM * TILE_DIM; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_T (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", tile_reg_idx=" << tile_reg_idx << ", base_addr=0x" << std::hex << base_addr << std::dec); break; } case VegetaLsuType::TILE_LOAD_U: { // tile_load_u: DestReg contains ureg index, map to tile registers - // ureg 0 -> tile reg 0, 1 + // ureg 0 -> tile reg 0, 1 (2KB total = 2 T-tiles) std::vector target_tregs = map_ureg_to_treg(vd); base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); - uint64_t current_addr = base_addr; - for (uint32_t treg_idx : target_tregs) { + // Use VegetaLsu for bulk U-tile load (2KB) + constexpr uint32_t T_TILE_ELEMENTS = TILE_DIM * TILE_DIM; + float tile_buffer[T_TILE_ELEMENTS * 2]; // 2 T-tiles for U-reg + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::U_TILE, + vd, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile 
registers + for (uint32_t t = 0; t < target_tregs.size(); ++t) { + uint32_t treg_idx = target_tregs[t]; assert(treg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[treg_idx]; - - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = current_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[t * T_TILE_ELEMENTS + row * TILE_DIM + col]; } } - current_addr += TILE_DIM * TILE_DIM * ELEMENT_SIZE; // Move to next tile (1KB) } - DP(2, "TILE_LOAD_U: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < T_TILE_ELEMENTS * 2; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_U (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", ureg_idx=" << vd << ", target_tregs=[" << target_tregs[0] << ", " << target_tregs[1] << "], base_addr=0x" << std::hex << base_addr << std::dec); break; } case VegetaLsuType::TILE_LOAD_V: { // tile_load_v: DestReg contains vreg index, map to tile registers - // vreg 0 -> tile reg 0, 1, 2, 3 + // vreg 0 -> tile reg 0, 1, 2, 3 (4KB total = 4 T-tiles) std::vector target_tregs = map_vreg_to_treg(vd); base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); - uint64_t current_addr = base_addr; - for (uint32_t treg_idx : target_tregs) { + // Use VegetaLsu for bulk V-tile load (4KB) + constexpr uint32_t T_TILE_ELEMENTS = TILE_DIM * TILE_DIM; + float tile_buffer[T_TILE_ELEMENTS * 4]; // 4 T-tiles for V-reg + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::V_TILE, + vd, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile registers + for (uint32_t t = 0; t < target_tregs.size(); ++t) { + uint32_t treg_idx = target_tregs[t]; assert(treg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[treg_idx]; - - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = current_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[t * T_TILE_ELEMENTS + row * TILE_DIM + col]; } } - current_addr += TILE_DIM * TILE_DIM * ELEMENT_SIZE; // Move to next tile (1KB) } - DP(2, "TILE_LOAD_V: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < T_TILE_ELEMENTS * 4; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_V (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", vreg_idx=" << vd << ", target_tregs=[" << target_tregs[0] << ", " << target_tregs[1] << ", " << 
target_tregs[2] << ", " << target_tregs[3] << "], base_addr=0x" << std::hex << base_addr << std::dec); @@ -867,22 +875,28 @@ class SparseUnit::Impl { assert(meta_reg_idx < metadata_reg_file_.size() && "Metadata register index out of bounds"); auto &metadata_reg = metadata_reg_file_[meta_reg_idx]; - // Load metadata from memory: 16 rows x 16 columns = 256 uint4 elements = 128 bytes + // Use VegetaLsu for bulk M-tile load (128 bytes) + constexpr uint32_t M_TILE_SIZE = 128; + uint8_t meta_buffer[M_TILE_SIZE]; + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::M_TILE, + meta_reg_idx, wid, tid, meta_buffer); + + // Parse nibbles from linear buffer into metadata register // Each byte stores two uint4 values: upper nibble for col N, lower nibble for col N+1 for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; col += 2) { - uint64_t mem_addr = base_addr + (row * (TILE_DIM / 2) + col / 2); - uint8_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, 1); - trace_data->mem_addrs.at(tid).push_back({mem_addr, 1}); - - // Upper nibble for col N, lower nibble for col N+1 - metadata_reg[row][col] = (mem_data >> 4) & 0x0F; - metadata_reg[row][col + 1] = mem_data & 0x0F; + uint8_t byte = meta_buffer[row * (TILE_DIM / 2) + col / 2]; + metadata_reg[row][col] = (byte >> 4) & 0x0F; + metadata_reg[row][col + 1] = byte & 0x0F; } } + + // Record trace for all bytes + for (uint32_t i = 0; i < M_TILE_SIZE; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i, 1}); + } - DP(2, "TILE_LOAD_M: wid=" << wid << ", tid=" << tid + DP(2, "TILE_LOAD_M (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", metadata_reg_idx=" << meta_reg_idx << ", base_addr=0x" << std::hex << base_addr << std::dec); break; } @@ -911,21 +925,26 @@ class SparseUnit::Impl { assert(vs3 < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[vs3]; constexpr uint32_t TILE_DIM = 16; - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); // 4 bytes for fp32 - // Store tile to memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes + // Copy 2D tile register to linear buffer for VegetaLsu + float tile_buffer[TILE_DIM * TILE_DIM]; for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = base_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - float value = tile_reg[row][col]; - uint32_t mem_data = 0; - std::memcpy(&mem_data, &value, ELEMENT_SIZE); - core_->dcache_write(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); + tile_buffer[row * TILE_DIM + col] = tile_reg[row][col]; } } + + // Use VegetaLsu for bulk tile store (1KB) + core_->vegeta_lsu()->store_tile(base_addr, VegetaLsu::TileType::T_TILE, + vs3, wid, tid, tile_buffer); + + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < TILE_DIM * TILE_DIM; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } - DP(2, "TILE_STORE: wid=" << wid << ", tid=" << tid << ", vs3=" << vs3 + DP(2, "TILE_STORE (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", vs3=" << vs3 << ", base_addr=0x" << std::hex << base_addr << std::dec); #else std::abort(); // EXT_VEGETA_ENABLE required for store operations diff --git a/sim/simx/vegeta_lsu.cpp b/sim/simx/vegeta_lsu.cpp new file mode 100644 index 000000000..7bc782b0e --- /dev/null +++ b/sim/simx/vegeta_lsu.cpp @@ -0,0 +1,134 @@ +// 
Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "vegeta_lsu.h"
+#include "core.h"
+#include "debug.h"
+#include <algorithm>
+
+using namespace vortex;
+
+// Tile sizes in bytes
+static constexpr uint32_t T_TILE_SIZE = 1024;  // 16x16 x 4 bytes
+static constexpr uint32_t U_TILE_SIZE = 2048;  // 2 T-tiles
+static constexpr uint32_t V_TILE_SIZE = 4096;  // 4 T-tiles
+static constexpr uint32_t M_TILE_SIZE = 128;   // 16x16 x 0.5 bytes (nibbles)
+
+// Cache line size (from config)
+static constexpr uint32_t CACHE_LINE_SIZE = 64;
+
+VegetaLsu::VegetaLsu(const SimContext& ctx, const char* name, Core* core, uint32_t num_ports)
+  : SimObject(ctx, name)
+  , dcache_req_ports(num_ports, this)
+  , dcache_rsp_ports(num_ports, this)
+  , core_(core)
+  , num_ports_(num_ports)
+  , perf_stats_()
+{}
+
+VegetaLsu::~VegetaLsu() {}
+
+void VegetaLsu::reset() {
+  perf_stats_ = PerfStats();
+}
+
+void VegetaLsu::tick() {
+  // Process cache responses (for future async implementation)
+  for (uint32_t p = 0; p < num_ports_; ++p) {
+    auto& rsp_port = dcache_rsp_ports.at(p);
+    if (!rsp_port.empty()) {
+      // For now, we use synchronous access via core_->dcache_read/write
+      rsp_port.pop();
+    }
+  }
+}
+
+uint32_t VegetaLsu::tile_size(TileType type) {
+  switch (type) {
+  case TileType::T_TILE: return T_TILE_SIZE;
+  case TileType::U_TILE: return U_TILE_SIZE;
+  case TileType::V_TILE: return V_TILE_SIZE;
+  case TileType::M_TILE: return M_TILE_SIZE;
+  default: return 0;
+  }
+}
+
+void VegetaLsu::load_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                          uint32_t wid, uint32_t tid, void* reg_data) {
+  __unused(reg_idx, wid, tid);
+
+  uint32_t size = tile_size(type);
+  uint8_t* data = reinterpret_cast<uint8_t*>(reg_data);
+
+  DP(2, "VEGETA_LSU: load_tile type=" << (int)type << " addr=0x" << std::hex << addr
+        << " size=" << std::dec << size << " reg=" << reg_idx);
+
+  // Issue cache-line sized requests
+  uint32_t bytes_loaded = 0;
+  uint64_t current_addr = addr;
+
+  while (bytes_loaded < size) {
+    uint32_t chunk_size = std::min(size - bytes_loaded, CACHE_LINE_SIZE);
+
+    // Use core's dcache_read for each cache line
+    core_->dcache_read(data + bytes_loaded, current_addr, chunk_size);
+
+    ++perf_stats_.cache_requests;
+    bytes_loaded += chunk_size;
+    current_addr += chunk_size;
+  }
+
+  ++perf_stats_.tile_loads;
+  perf_stats_.total_bytes += size;
+}
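+
+// For reference (illustrative arithmetic, assuming the tile base address is
+// cache-line aligned): a T-tile load above issues 1024/64 = 16 dcache requests,
+// a U-tile 32, a V-tile 64, and an M-tile 128/64 = 2.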
+void VegetaLsu::store_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                           uint32_t wid, uint32_t tid, const void* reg_data) {
+  __unused(reg_idx, wid, tid);
+
+  uint32_t size = tile_size(type);
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(reg_data);
+
+  DP(2, "VEGETA_LSU: store_tile type=" << (int)type << " addr=0x" << std::hex << addr
+        << " size=" << std::dec << size << " reg=" << reg_idx);
+
+  // Issue cache-line sized requests
+  uint32_t bytes_stored = 0;
+  uint64_t current_addr = addr;
+
+  while (bytes_stored < size) {
+    uint32_t chunk_size = std::min(size - bytes_stored, CACHE_LINE_SIZE);
+
+    // Use core's dcache_write for each cache line
+    core_->dcache_write(data + bytes_stored, current_addr, chunk_size);
+
+    ++perf_stats_.cache_requests;
+    bytes_stored += chunk_size;
+    current_addr += chunk_size;
+  }
+
+  ++perf_stats_.tile_stores;
+  perf_stats_.total_bytes += size;
+}
+
+void VegetaLsu::issue_tile_requests(const TileRequest& req, void* data) {
+  if (req.is_store) {
+    store_tile(req.addr, req.type, req.reg_idx, req.wid, req.tid, data);
+  } else {
+    load_tile(req.addr, req.type, req.reg_idx, req.wid, req.tid, data);
+  }
+}
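+
+// Example of the resulting counters (illustrative): one T-tile load followed by one
+// T-tile store leaves perf_stats_ at tile_loads=1, tile_stores=1, cache_requests=32
+// and total_bytes=2048.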
diff --git a/sim/simx/vegeta_lsu.h b/sim/simx/vegeta_lsu.h
new file mode 100644
index 000000000..df27a14a3
--- /dev/null
+++ b/sim/simx/vegeta_lsu.h
@@ -0,0 +1,102 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include <vector>
+#include <cstdint>
+#include "types.h"
+#include "instr_trace.h"
+
+namespace vortex {
+
+class Core;
+
+// VEGETA Tile LSU - Dedicated Load/Store Unit for tile operations
+// Handles bulk tile transfers (T-tile: 1KB, M-tile: 128B) via L1 D-Cache
+class VegetaLsu : public SimObject<VegetaLsu> {
+public:
+  // Tile request types
+  enum class TileType {
+    T_TILE = 0,  // 16x16 float = 1024 bytes
+    U_TILE = 1,  // 2 T-tiles = 2048 bytes
+    V_TILE = 2,  // 4 T-tiles = 4096 bytes
+    M_TILE = 3   // 16x16 nibbles = 128 bytes (metadata)
+  };
+
+  // Tile request structure
+  struct TileRequest {
+    uint64_t addr;      // Base address
+    TileType type;      // Tile type
+    uint32_t reg_idx;   // Destination/source register index
+    bool     is_store;  // true = store, false = load
+    uint32_t wid;       // Warp ID
+    uint32_t tid;       // Thread ID
+  };
+
+  struct PerfStats {
+    uint64_t tile_loads;
+    uint64_t tile_stores;
+    uint64_t cache_requests;
+    uint64_t total_bytes;
+
+    PerfStats()
+      : tile_loads(0)
+      , tile_stores(0)
+      , cache_requests(0)
+      , total_bytes(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->tile_loads     += rhs.tile_loads;
+      this->tile_stores    += rhs.tile_stores;
+      this->cache_requests += rhs.cache_requests;
+      this->total_bytes    += rhs.total_bytes;
+      return *this;
+    }
+  };
+
+  // Memory request/response ports for L1 cache connection
+  std::vector<SimPort<MemReq>> dcache_req_ports;
+  std::vector<SimPort<MemRsp>> dcache_rsp_ports;
+
+  VegetaLsu(const SimContext& ctx, const char* name, Core* core, uint32_t num_ports);
+  ~VegetaLsu();
+
+  void reset();
+  void tick();
+
+  // Load tile from memory to register
+  void load_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                 uint32_t wid, uint32_t tid, void* reg_data);
+
+  // Store tile from register to memory
+  void store_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                  uint32_t wid, uint32_t tid, const void* reg_data);
+
+  // Get tile size in bytes
+  static uint32_t tile_size(TileType type);
+
+  const PerfStats& perf_stats() const { return perf_stats_; }
+
+private:
+  Core* core_;
+  uint32_t num_ports_;
+  PerfStats perf_stats_;
+
+  // Issue cache-line requests for a tile
+  void issue_tile_requests(const TileRequest& req, void* data);
+};
+
+} // namespace vortex
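+
+// Illustrative usage from a caller that owns a 16x16 staging buffer (this mirrors how
+// SparseUnit::Impl drives the unit; 'staging' and the index variables are placeholders):
+//   float staging[16 * 16];
+//   core->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::T_TILE,
+//                                 tile_reg_idx, wid, tid, staging);
+//   // ... copy 'staging' into the 16x16 tile register ...
+//   auto& stats = core->vegeta_lsu()->perf_stats();  // e.g. stats.tile_loads == 1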