From ba5fae8d43dcea731bacc4f3901aacc5ef813365 Mon Sep 17 00:00:00 2001
From: tandonmitul27
Date: Thu, 5 Feb 2026 00:16:03 +0530
Subject: [PATCH 1/3] Added MLP tests

---
 tests/regression/mlp/Makefile          |  14 +
 tests/regression/mlp/common.h          |  38 +++
 tests/regression/mlp/kernel.cpp        | 162 +++++++++
 tests/regression/mlp/main.cpp          | 346 +++++++++++++++++++
 tests/regression/mlp_vegeta/Makefile   |  14 +
 tests/regression/mlp_vegeta/common.h   |  55 +++
 tests/regression/mlp_vegeta/kernel.cpp | 220 ++++++++++++
 tests/regression/mlp_vegeta/main.cpp   | 451 +++++++++++++++++++++++++
 8 files changed, 1300 insertions(+)
 create mode 100644 tests/regression/mlp/Makefile
 create mode 100644 tests/regression/mlp/common.h
 create mode 100644 tests/regression/mlp/kernel.cpp
 create mode 100644 tests/regression/mlp/main.cpp
 create mode 100644 tests/regression/mlp_vegeta/Makefile
 create mode 100644 tests/regression/mlp_vegeta/common.h
 create mode 100644 tests/regression/mlp_vegeta/kernel.cpp
 create mode 100644 tests/regression/mlp_vegeta/main.cpp

diff --git a/tests/regression/mlp/Makefile b/tests/regression/mlp/Makefile
new file mode 100644
index 000000000..d50062bac
--- /dev/null
+++ b/tests/regression/mlp/Makefile
@@ -0,0 +1,14 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := mlp
+
+SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+VX_SRCS := $(SRC_DIR)/kernel.cpp
+
+OPTS ?=
+
+include ../common.mk
diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h
new file mode 100644
index 000000000..7cc37012a
--- /dev/null
+++ b/tests/regression/mlp/common.h
@@ -0,0 +1,38 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// MLP Network Configuration (all dimensions multiples of 16)
+#define INPUT_DIM 128
+#define HIDDEN1_DIM 256
+#define HIDDEN2_DIM 128
+#define HIDDEN3_DIM 64
+#define OUTPUT_DIM 16
+
+// Number of layers
+#define NUM_LAYERS 4
+
+// Layer configuration structure
+typedef struct {
+  uint32_t input_dim;
+  uint32_t output_dim;
+  uint64_t weights_addr; // Weight matrix: output_dim x input_dim
+  uint64_t bias_addr;    // Bias vector: output_dim
+} layer_config_t;
+
+// Kernel arguments
+typedef struct {
+  uint32_t num_layers;
+  uint32_t batch_size;         // Number of input samples
+  uint64_t input_addr;         // Input data
+  uint64_t output_addr;        // Final output
+  uint64_t layer_configs_addr; // Pointer to layer configurations
+  // Intermediate buffers for layer outputs
+  uint64_t buffer1_addr;       // For layer 1 output / layer 2 input
+  uint64_t buffer2_addr;       // For layer 2 output / layer 3 input
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp
new file mode 100644
index 000000000..50e4ed24e
--- /dev/null
+++ b/tests/regression/mlp/kernel.cpp
@@ -0,0 +1,162 @@
+#include
+#include
+#include
+#include "common.h"
+
+// This kernel is designed for single-core parallel execution (scales with warps/threads). Multi-core would require inter-core synchronization between layers, which Vortex's vx_spawn_threads doesn't provide yet (to the best of my knowledge)...
+
+// ReLU activation function
+inline TYPE relu(TYPE x) {
+  return (x > 0) ?
x : 0; +} + +// Layer execution arguments passed to spawned threads +typedef struct { + TYPE* input; + TYPE* weights; + TYPE* bias; + TYPE* output; + uint32_t input_dim; + uint32_t output_dim; + bool apply_relu; +} layer_args_t; + + +// Softmax execution arguments +typedef struct { + TYPE* data; + uint32_t size; + TYPE* max_val; + TYPE* sum_exp; +} softmax_args_t; + +// Global shared variables for softmax reductions +TYPE g_softmax_max_val; +TYPE g_softmax_sum_exp; + +// Thread function: thread 0 finds max, others wait +void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { + if (blockIdx.x == 0) { + // Thread 0 does the reduction serially + TYPE max_val = args->data[0]; + for (uint32_t i = 1; i < args->size; ++i) { + if (args->data[i] > max_val) { + max_val = args->data[i]; + } + } + *(args->max_val) = max_val; + } +} + +// Thread function: compute exp in parallel, thread 0 sums +void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + + // All threads compute their exp value + if (i < args->size) { + args->data[i] = expf(args->data[i] - *(args->max_val)); + } + + // Thread 0 does the sum reduction serially + if (i == 0) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; + } + *(args->sum_exp) = sum_exp; + } +} + +// Thread function: normalize in parallel +void softmax_normalize_thread(softmax_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; + if (i < args->size) { + args->data[i] /= *(args->sum_exp); + } +} + +// Softmax implementation +void apply_softmax(TYPE* data, uint32_t size) { + softmax_args_t args; + args.data = data; + args.size = size; + args.max_val = &g_softmax_max_val; + args.sum_exp = &g_softmax_sum_exp; + + uint32_t single_thread = 1; + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); +} + +// Each thread computes one output neuron +// y[i] = sum_j(W[i][j] * x[j]) + b[i] +void fc_layer_thread(layer_args_t* __UNIFORM__ args) { + uint32_t i = blockIdx.x; // Output neuron index + + if (i >= args->output_dim) return; + + TYPE sum = 0; + + // Compute dot product: W[i] · input + for (uint32_t j = 0; j < args->input_dim; ++j) { + sum += args->weights[i * args->input_dim + j] * args->input[j]; + } + + // Add bias + sum += args->bias[i]; + + // Apply ReLU activation (except for output layer) + if (args->apply_relu) { + sum = relu(sum); + } + + args->output[i] = sum; +} + +// MLP inference kernel +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + + // Layer buffers + auto input = reinterpret_cast(arg->input_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + auto output = reinterpret_cast(arg->output_addr); + + // Pointer arrays for ping-pong buffering + TYPE* layer_inputs[4] = {input, buffer1, buffer2, buffer1}; + TYPE* layer_outputs[4] = {buffer1, buffer2, buffer1, output}; + + layer_args_t layer_args; + + // Execute each layer with parallel threads + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto cfg = &layer_configs[layer]; + + layer_args.input = layer_inputs[layer]; + layer_args.weights = reinterpret_cast(cfg->weights_addr); + layer_args.bias = reinterpret_cast(cfg->bias_addr); + layer_args.output = 
layer_outputs[layer]; + layer_args.input_dim = cfg->input_dim; + layer_args.output_dim = cfg->output_dim; + layer_args.apply_relu = (layer < arg->num_layers - 1); + + // vx_spawn_threads provides barrier synchronization after all threads complete + vx_spawn_threads(1, &layer_args.output_dim, nullptr, + (vx_kernel_func_cb)fc_layer_thread, &layer_args); + } + + // Apply softmax to final output + uint32_t output_size = layer_configs[arg->num_layers - 1].output_dim; + apply_softmax(output, output_size); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + + uint32_t grid_dim = 1; + return vx_spawn_threads(1, &grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp/main.cpp b/tests/regression/mlp/main.cpp new file mode 100644 index 000000000..ca1d7373c --- /dev/null +++ b/tests/regression/mlp/main.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 150 // Higher tolerance for deep network (4 layers, accumulated FP errors) + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +// Device handles +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; + +// Weight and bias buffers for each layer +vx_buffer_h weights_buffers[NUM_LAYERS]; +vx_buffer_h bias_buffers[NUM_LAYERS]; + +kernel_arg_t kernel_arg = {}; + +// Layer dimensions +const uint32_t layer_input_dims[NUM_LAYERS] = {INPUT_DIM, HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}; +const uint32_t layer_output_dims[NUM_LAYERS] = {HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM, OUTPUT_DIM}; + +static void show_usage() { + std::cout << "Vortex MLP Test." << std::endl; +// std::cout << "Usage: [-k: kernel] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "k:h")) != -1) { + switch (c) { + case 'k': + kernel_file = optarg; + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + vx_mem_free(layer_configs_buffer); + for (int i = 0; i < NUM_LAYERS; ++i) { + vx_mem_free(weights_buffers[i]); + vx_mem_free(bias_buffers[i]); + } + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +// Initialize weights with Xavier initialization +void init_weights(std::vector& weights, uint32_t fan_in, uint32_t fan_out) { + TYPE scale = std::sqrt(2.0f / (fan_in + fan_out)); + for (size_t i = 0; i < weights.size(); ++i) { + // Generate random value between -scale and scale + weights[i] = (static_cast(rand()) / RAND_MAX * 2.0f - 1.0f) * scale; + } +} + +// Initialize biases to zero +void init_bias(std::vector& bias) { + for (size_t i = 0; i < bias.size(); ++i) { + bias[i] = 0.0f; + } +} + +// CPU reference implementation of ReLU +TYPE relu_cpu(TYPE x) { + return (x > 0) ? 
x : 0; +} + +// CPU reference implementation of fully connected layer +void fc_layer_cpu(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, uint32_t input_dim, uint32_t output_dim, bool apply_relu) { + for (uint32_t i = 0; i < output_dim; ++i) { + TYPE sum = 0; + for (uint32_t j = 0; j < input_dim; ++j) { + sum += weights[i * input_dim + j] * input[j]; + } + sum += bias[i]; + output[i] = apply_relu ? relu_cpu(sum) : sum; + } +} + +// CPU reference implementation of softmax +void softmax_cpu(TYPE* data, uint32_t size) { + // Find max for numerical stability + TYPE max_val = data[0]; + for (uint32_t i = 1; i < size; ++i) { + if (data[i] > max_val) { + max_val = data[i]; + } + } + + // Compute exp(x - max) and sum + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < size; ++i) { + data[i] = std::exp(data[i] - max_val); + sum_exp += data[i]; + } + + // Normalize + for (uint32_t i = 0; i < size; ++i) { + data[i] /= sum_exp; + } +} + +// Run MLP inference on CPU for reference +void mlp_cpu(const TYPE* input, TYPE* output, + const std::vector>& weights, + const std::vector>& biases) { + + std::vector layer_output[NUM_LAYERS]; + + for (int l = 0; l < NUM_LAYERS; ++l) { + layer_output[l].resize(layer_output_dims[l]); + + const TYPE* layer_input = (l == 0) ? input : layer_output[l-1].data(); + bool apply_relu = (l < NUM_LAYERS - 1); + + fc_layer_cpu(layer_input, weights[l].data(), biases[l].data(), + layer_output[l].data(), layer_input_dims[l], layer_output_dims[l], apply_relu); + } + + // Copy final output + memcpy(output, layer_output[NUM_LAYERS-1].data(), OUTPUT_DIM * sizeof(TYPE)); + + // Apply softmax to final output + softmax_cpu(output, OUTPUT_DIM); +} + +// Compare floating point values with ULP tolerance +bool compare_float(TYPE a, TYPE b, int index, int& errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f (diff=%d ULP)\n", index, b, a, d); + } + return false; + } + return true; +} + +int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(42); // Fixed seed for reproducible results across config changes + + std::cout << "=== Vortex MLP Neural Network Test ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM + << " -> " << HIDDEN2_DIM << " -> " << HIDDEN3_DIM + << " -> " << OUTPUT_DIM << std::endl; + + // Open device connection + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Allocate host buffers for weights and biases + std::vector> h_weights(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + h_weights[l].resize(out_dim * in_dim); + h_biases[l].resize(out_dim); + + init_weights(h_weights[l], in_dim, out_dim); + init_bias(h_biases[l]); + } + + // Allocate host input/output buffers + std::vector h_input(INPUT_DIM); + std::vector h_output(OUTPUT_DIM); + std::vector h_ref_output(OUTPUT_DIM); + + // Initialize input with random values + for (uint32_t i = 0; i < INPUT_DIM; ++i) { + h_input[i] = static_cast(rand()) / RAND_MAX; + } + + // Print input summary + // std::cout << "Input (first 8 values): "; + // for (int i = 0; i < 8; ++i) { + // std::cout << h_input[i] << " "; + // } + // std::cout << "..." 
<< std::endl; + + // Allocate device memory for input/output + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + + RT_CHECK(vx_mem_alloc(device, OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + + // Intermediate buffers (sized for largest hidden layer) + uint32_t max_hidden = std::max(HIDDEN1_DIM, std::max(HIDDEN2_DIM, HIDDEN3_DIM)); + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + + RT_CHECK(vx_mem_alloc(device, max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + // Allocate and upload weights and biases + std::vector layer_configs(NUM_LAYERS); + + for (int l = 0; l < NUM_LAYERS; ++l) { + uint32_t in_dim = layer_input_dims[l]; + uint32_t out_dim = layer_output_dims[l]; + + // Allocate weights + RT_CHECK(vx_mem_alloc(device, out_dim * in_dim * sizeof(TYPE), VX_MEM_READ, &weights_buffers[l])); + RT_CHECK(vx_mem_address(weights_buffers[l], &layer_configs[l].weights_addr)); + RT_CHECK(vx_copy_to_dev(weights_buffers[l], h_weights[l].data(), 0, out_dim * in_dim * sizeof(TYPE))); + + // Allocate biases + RT_CHECK(vx_mem_alloc(device, out_dim * sizeof(TYPE), VX_MEM_READ, &bias_buffers[l])); + RT_CHECK(vx_mem_address(bias_buffers[l], &layer_configs[l].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[l], h_biases[l].data(), 0, out_dim * sizeof(TYPE))); + + layer_configs[l].input_dim = in_dim; + layer_configs[l].output_dim = out_dim; + + std::cout << " Layer " << l << ": " << in_dim << " x " << out_dim + << " (weights: " << (out_dim * in_dim * sizeof(TYPE)) << " bytes)" << std::endl; + } + + // Upload layer configs + RT_CHECK(vx_mem_alloc(device, NUM_LAYERS * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, NUM_LAYERS * sizeof(layer_config_t))); + + // Upload input + std::cout << "Uploading input data..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, INPUT_DIM * sizeof(TYPE))); + + // Set kernel arguments + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.batch_size = 1; + + // Upload kernel binary + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + // Upload kernel arguments + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + // Start device + std::cout << "Starting MLP inference on device..." << std::endl; + auto time_start = std::chrono::high_resolution_clock::now(); + + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + + // Wait for completion + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + std::cout << "Elapsed time: " << elapsed << " ms" << std::endl; + + // Download output + std::cout << "Downloading output..." 
<< std::endl; + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, OUTPUT_DIM * sizeof(TYPE))); + + // Compute CPU reference + std::cout << "Computing CPU reference..." << std::endl; + mlp_cpu(h_input.data(), h_ref_output.data(), h_weights, h_biases); + + // Print outputs + // std::cout << "Device output: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_output[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << "CPU reference: "; + // for (int i = 0; i < OUTPUT_DIM; ++i) { + // std::cout << h_ref_output[i] << " "; + // } + // std::cout << std::endl; + + // Verify results + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (uint32_t i = 0; i < OUTPUT_DIM; ++i) { + if (!compare_float(h_output[i], h_ref_output[i], i, errors)) { + ++errors; + } + } + + // Cleanup + std::cout << "Cleaning up..." << std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} diff --git a/tests/regression/mlp_vegeta/Makefile b/tests/regression/mlp_vegeta/Makefile new file mode 100644 index 000000000..b2b36e7cd --- /dev/null +++ b/tests/regression/mlp_vegeta/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := mlp_vegeta + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= + +include ../common.mk diff --git a/tests/regression/mlp_vegeta/common.h b/tests/regression/mlp_vegeta/common.h new file mode 100644 index 000000000..ebbf9c6d3 --- /dev/null +++ b/tests/regression/mlp_vegeta/common.h @@ -0,0 +1,55 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#ifndef TYPE +#define TYPE float +#endif + +// MLP Network Configuration (same as mlp_test, all multiples of 16) +#define INPUT_DIM 128 +#define HIDDEN1_DIM 256 +#define HIDDEN2_DIM 128 +#define HIDDEN3_DIM 64 +#define OUTPUT_DIM 16 + +// Number of layers +#define NUM_LAYERS 4 + +// VEGETA tile dimensions +#define TILE_SIZE 16 +#define BATCH_SIZE 16 // VEGETA requires batch=16 for proper 16x16 tiles +#define T_TILE_BYTES (TILE_SIZE * TILE_SIZE * sizeof(TYPE)) // 1KB +#define U_TILE_BYTES (2 * T_TILE_BYTES) // 2KB +#define V_TILE_BYTES (4 * T_TILE_BYTES) // 4KB +#define M_TILE_BYTES (TILE_SIZE * TILE_SIZE / 2) // 128 bytes metadata + +// GEMM modes for VEGETA +typedef enum { + MLP_MODE_TGEMM = 0, // Dense × Dense (T × T) + MLP_MODE_UGEMM = 1, // Dense × 2:4 Sparse (T × U) + MLP_MODE_VGEMM = 2 // Dense × 1:4 Sparse (T × V) +} mlp_mode_t; + +// Layer configuration structure +typedef struct { + uint32_t input_dim; + uint32_t output_dim; + uint64_t weights_addr; // Weight matrix: input_dim x output_dim (row-major) + uint64_t bias_addr; // Bias vector: output_dim + uint64_t metadata_addr; // Sparsity metadata (for UGEMM/VGEMM modes) +} layer_config_t; + +// Kernel arguments +typedef struct { + uint32_t num_layers; + uint32_t mode; // MLP_MODE_TGEMM, MLP_MODE_UGEMM, or MLP_MODE_VGEMM + uint64_t input_addr; // Input data: [BATCH_SIZE x INPUT_DIM] + uint64_t output_addr; // Final output: [BATCH_SIZE x OUTPUT_DIM] + uint64_t layer_configs_addr; // Pointer to layer configurations + uint64_t buffer1_addr; // Intermediate buffer 1 + uint64_t buffer2_addr; // Intermediate buffer 2 +} kernel_arg_t; + +#endif diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp new file 
mode 100644 index 000000000..ad57212f0 --- /dev/null +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include "common.h" + +// Aligned tile buffer for intermediate results +static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); + +// ReLU activation (in-place) for batched data +inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + for (uint32_t i = 0; i < dim; ++i) { + TYPE& v = data[b * dim + i]; + if (v < 0) v = 0; + } + } +} + +// Softmax for each sample in batch +void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + for (uint32_t b = 0; b < batch_size; ++b) { + TYPE* sample = data + b * dim; + + TYPE max_val = sample[0]; + for (uint32_t i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + + TYPE sum_exp = 0.0f; + for (uint32_t i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + + for (uint32_t i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } + } +} + +// Initialize C tile buffer to zeros +inline void clear_C_tile() { + for (uint32_t i = 0; i < TILE_SIZE * TILE_SIZE; ++i) { + g_C_tile[i] = 0.0f; + } +} + +// ============================================================================= +// TGEMM: Dense × Dense (T × T → T) +// ============================================================================= +void vegeta_tgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lt(2, (size_t)B_tile, 0); // Load B to T2 + vx_tgemm(0, 1, 2); // T0 += T1 × T2 + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_tgemm_batch( + TYPE* input, TYPE* weights, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; + vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// UGEMM: Dense × 2:4 Sparse (T × U → T) +// For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// ============================================================================= +void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 + vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_ugemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + 
clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: half the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// VGEMM: Dense × 1:4 Sparse (T × V → T) +// For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// ============================================================================= +void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { + vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) + vx_lt(1, (size_t)A_tile, 0); // Load A to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 + vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 + vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_st((size_t)C_tile, 0, 0); // Store result +} + +void vegeta_layer_vgemm_batch( + TYPE* input, TYPE* weights, uint8_t* metadata, TYPE* bias, TYPE* output, + uint32_t in_dim, uint32_t out_dim +) { + uint32_t in_tiles = in_dim / TILE_SIZE; + uint32_t out_tiles = out_dim / TILE_SIZE; + + for (uint32_t out_t = 0; out_t < out_tiles; ++out_t) { + clear_C_tile(); + + for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { + TYPE* A_tile = input + in_t * TILE_SIZE; + // Compressed weights: quarter the K dimension + TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; + // Metadata: 128 bytes per tile + uint8_t* M_tile = metadata + (in_t * out_tiles + out_t) * M_TILE_BYTES; + vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); + } + + for (uint32_t row = 0; row < BATCH_SIZE; ++row) { + for (uint32_t col = 0; col < TILE_SIZE; ++col) { + output[row * out_dim + out_t * TILE_SIZE + col] = + g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; + } + } + } +} + +// ============================================================================= +// Kernel Entry Point +// ============================================================================= +void kernel_body(kernel_arg_t* __UNIFORM__ arg) { + auto input = reinterpret_cast(arg->input_addr); + auto output = reinterpret_cast(arg->output_addr); + auto layer_configs = reinterpret_cast(arg->layer_configs_addr); + auto buffer1 = reinterpret_cast(arg->buffer1_addr); + auto buffer2 = reinterpret_cast(arg->buffer2_addr); + uint32_t mode = arg->mode; + + TYPE* layer_input = input; + TYPE* layer_output = buffer1; + + for (uint32_t layer = 0; layer < arg->num_layers; ++layer) { + auto& config = layer_configs[layer]; + auto weights = reinterpret_cast(config.weights_addr); + auto bias = reinterpret_cast(config.bias_addr); + auto metadata = reinterpret_cast(config.metadata_addr); + + if (layer == arg->num_layers - 1) { + layer_output = output; + } else if (layer % 2 == 0) { + layer_output = buffer1; + } else { + layer_output = buffer2; + } + + // Execute layer based on mode + if (mode == MLP_MODE_TGEMM) { + vegeta_layer_tgemm_batch(layer_input, weights, bias, layer_output, + config.input_dim, config.output_dim); + } 
else if (mode == MLP_MODE_UGEMM) { + vegeta_layer_ugemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } else if (mode == MLP_MODE_VGEMM) { + vegeta_layer_vgemm_batch(layer_input, weights, metadata, bias, layer_output, + config.input_dim, config.output_dim); + } + + // Apply ReLU for hidden layers + if (layer < arg->num_layers - 1) { + apply_relu_batch(layer_output, BATCH_SIZE, config.output_dim); + } + + layer_input = layer_output; + } + + apply_softmax_batch(output, BATCH_SIZE, OUTPUT_DIM); +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(1, nullptr, nullptr, (vx_kernel_func_cb)kernel_body, arg); +} diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp new file mode 100644 index 000000000..a801b854c --- /dev/null +++ b/tests/regression/mlp_vegeta/main.cpp @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.vxbin"; + +vx_device_h device = nullptr; +vx_buffer_h input_buffer = nullptr; +vx_buffer_h output_buffer = nullptr; +vx_buffer_h layer_configs_buffer = nullptr; +vx_buffer_h buffer1 = nullptr; +vx_buffer_h buffer2 = nullptr; +std::vector weight_buffers; +std::vector bias_buffers; +std::vector metadata_buffers; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; + +static mlp_mode_t mlp_mode = MLP_MODE_TGEMM; + +static void show_usage() { + std::cout << "Vortex VEGETA MLP Test (Batched)." 
<< std::endl; +// std::cout << "Usage: [-m mode] [-h: help]" << std::endl; +// std::cout << " -m mode: GEMM mode (0=TGEMM, 1=UGEMM, 2=VGEMM) [default: 0]" << std::endl; +// std::cout << " TGEMM (0): Dense × Dense" << std::endl; +// std::cout << " UGEMM (1): Dense × 2:4 Sparse" << std::endl; +// std::cout << " VGEMM (2): Dense × 1:4 Sparse" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "m:h")) != -1) { + switch (c) { + case 'm': + mlp_mode = static_cast(atoi(optarg)); + if (mlp_mode < MLP_MODE_TGEMM || mlp_mode > MLP_MODE_VGEMM) { + std::cerr << "Error: Invalid mode " << mlp_mode << std::endl; + show_usage(); + exit(-1); + } + break; + case 'h': + show_usage(); + exit(0); + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(input_buffer); + vx_mem_free(output_buffer); + vx_mem_free(layer_configs_buffer); + vx_mem_free(buffer1); + vx_mem_free(buffer2); + for (auto& buf : weight_buffers) vx_mem_free(buf); + for (auto& buf : bias_buffers) vx_mem_free(buf); + for (auto& buf : metadata_buffers) vx_mem_free(buf); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +/////////////////////////////////////////////////////////////////////////////// +// Sparse Weight Compression +/////////////////////////////////////////////////////////////////////////////// + +// Compress weights to 2:4 sparsity (keep 2 largest of every 4) +// Input: dense weights [in_dim x out_dim] +// Output: compressed weights [in_dim/2 x out_dim], metadata +void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + // For the CPU reference, we need the "logical" dense weights with zeros + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 2) * out_dim); + + // Metadata: 128 bytes per 16x16 tile + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + // For each group of 4 consecutive values in K dimension, keep 2 largest + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Get 4 values + std::pair vals[4]; + for (int k = 0; k < 4; ++k) { + vals[k] = {std::abs(dense[(i + k) * out_dim + j]), k}; + } + // Sort by magnitude (descending) + std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); + + // Keep top 2, generate mask + uint8_t mask = 0; + for (int k = 0; k < 2; ++k) { + int pos = vals[k].second; + mask |= (1 << pos); + compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + + // Store metadata (simplified - in real impl would be tile-based) + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +// Compress weights to 1:4 sparsity (keep 1 largest of every 4) +void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim, + std::vector& compressed, std::vector& metadata, + std::vector& logical) { + logical.resize(in_dim * out_dim, 0.0f); + compressed.resize((in_dim / 4) * 
out_dim); + + int in_tiles = in_dim / TILE_SIZE; + int out_tiles = out_dim / TILE_SIZE; + metadata.resize(in_tiles * out_tiles * M_TILE_BYTES, 0); + + for (int j = 0; j < out_dim; ++j) { + int comp_idx = 0; + for (int i = 0; i < in_dim; i += 4) { + // Find largest of 4 + int max_pos = 0; + float max_val = std::abs(dense[i * out_dim + j]); + for (int k = 1; k < 4; ++k) { + float val = std::abs(dense[(i + k) * out_dim + j]); + if (val > max_val) { + max_val = val; + max_pos = k; + } + } + + compressed[comp_idx * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; + comp_idx++; + + // Store metadata + uint8_t mask = (1 << max_pos); + int tile_in = i / TILE_SIZE; + int tile_out = j / TILE_SIZE; + int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; + int row_in_tile = (i % TILE_SIZE) / 4; + int col_in_tile = j % TILE_SIZE; + int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; + if (byte_offset < (int)metadata.size()) { + metadata[byte_offset] = mask; + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +// CPU Reference +/////////////////////////////////////////////////////////////////////////////// + +static inline TYPE relu_cpu(TYPE x) { + return (x > 0) ? x : 0; +} + +static void softmax_cpu(TYPE* sample, int dim) { + TYPE max_val = sample[0]; + for (int i = 1; i < dim; ++i) { + if (sample[i] > max_val) max_val = sample[i]; + } + TYPE sum_exp = 0.0f; + for (int i = 0; i < dim; ++i) { + sample[i] = expf(sample[i] - max_val); + sum_exp += sample[i]; + } + for (int i = 0; i < dim; ++i) { + sample[i] /= sum_exp; + } +} + +static void layer_cpu_batch(const TYPE* input, const TYPE* weights, const TYPE* bias, + TYPE* output, int batch_size, int input_dim, int output_dim) { + for (int b = 0; b < batch_size; ++b) { + for (int j = 0; j < output_dim; ++j) { + TYPE sum = bias[j]; + for (int i = 0; i < input_dim; ++i) { + sum += input[b * input_dim + i] * weights[i * output_dim + j]; + } + output[b * output_dim + j] = sum; + } + } +} + +static void mlp_cpu_batch(const std::vector& input, + const std::vector>& weights, + const std::vector>& biases, + const std::vector>& dims, + std::vector& output) { + std::vector layer_input = input; + std::vector layer_output; + + for (size_t layer = 0; layer < dims.size(); ++layer) { + int in_dim = dims[layer].first; + int out_dim = dims[layer].second; + layer_output.resize(BATCH_SIZE * out_dim); + + layer_cpu_batch(layer_input.data(), weights[layer].data(), + biases[layer].data(), layer_output.data(), + BATCH_SIZE, in_dim, out_dim); + + if (layer < dims.size() - 1) { + for (auto& v : layer_output) { + v = relu_cpu(v); + } + } + + layer_input = layer_output; + } + + output = layer_output; + for (int b = 0; b < BATCH_SIZE; ++b) { + softmax_cpu(&output[b * OUTPUT_DIM], OUTPUT_DIM); + } +} + +// Compare floats with relative tolerance +static bool compare_float(TYPE a, TYPE b, int batch, int idx, int& errors) { + constexpr float REL_TOL = 0.05f; // 5% relative tolerance + constexpr float ABS_TOL = 1e-6f; + + float diff = std::abs(a - b); + float max_val = std::max(std::abs(a), std::abs(b)); + + if (diff > REL_TOL * max_val + ABS_TOL) { + if (errors < 10) { + printf("*** error: [batch=%d, idx=%d] expected=%.6f, actual=%.6f (diff=%.2f%%)\n", + batch, idx, b, a, 100.0f * diff / max_val); + } + ++errors; + return false; + } + return true; +} + +/////////////////////////////////////////////////////////////////////////////// + 
+int main(int argc, char *argv[]) { + parse_args(argc, argv); + + std::srand(50); + + std::vector> layer_dims = { + {INPUT_DIM, HIDDEN1_DIM}, + {HIDDEN1_DIM, HIDDEN2_DIM}, + {HIDDEN2_DIM, HIDDEN3_DIM}, + {HIDDEN3_DIM, OUTPUT_DIM} + }; + + const char* mode_name; + switch (mlp_mode) { + case MLP_MODE_TGEMM: mode_name = "TGEMM (Dense × Dense)"; break; + case MLP_MODE_UGEMM: mode_name = "UGEMM (Dense × 2:4 Sparse)"; break; + case MLP_MODE_VGEMM: mode_name = "VGEMM (Dense × 1:4 Sparse)"; break; + default: mode_name = "Unknown"; + } + + std::cout << "=== Vortex VEGETA MLP Neural Network Test (Batched) ===" << std::endl; + std::cout << "Network: " << INPUT_DIM << " -> " << HIDDEN1_DIM << " -> " + << HIDDEN2_DIM << " -> " << HIDDEN3_DIM << " -> " << OUTPUT_DIM << std::endl; + std::cout << "Mode: " << mode_name << std::endl; + std::cout << "Batch size: " << BATCH_SIZE << std::endl; + + std::cout << "Opening device connection..." << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // Generate batched input + std::vector h_input(BATCH_SIZE * INPUT_DIM); + for (auto& v : h_input) { + v = static_cast(rand()) / RAND_MAX; + } + + // Generate weights, biases, and optional sparse compression + std::vector> h_weights_dense(NUM_LAYERS); + std::vector> h_weights_compressed(NUM_LAYERS); + std::vector> h_weights_logical(NUM_LAYERS); + std::vector> h_biases(NUM_LAYERS); + std::vector> h_metadata(NUM_LAYERS); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + // Generate dense weights + h_weights_dense[layer].resize(in_dim * out_dim); + for (auto& v : h_weights_dense[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + h_biases[layer].resize(out_dim); + for (auto& v : h_biases[layer]) { + v = (static_cast(rand()) / RAND_MAX - 0.5f) * 0.1f; + } + + // Compress weights for sparse modes + if (mlp_mode == MLP_MODE_UGEMM) { + compress_2_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else if (mlp_mode == MLP_MODE_VGEMM) { + compress_1_4_sparse(h_weights_dense[layer], in_dim, out_dim, + h_weights_compressed[layer], h_metadata[layer], + h_weights_logical[layer]); + } else { + h_weights_logical[layer] = h_weights_dense[layer]; + } + } + + // Allocate device memory + std::cout << "Allocating device memory..." << std::endl; + + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * INPUT_DIM * sizeof(TYPE), VX_MEM_READ, &input_buffer)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE), VX_MEM_READ_WRITE, &output_buffer)); + + int max_hidden = std::max({HIDDEN1_DIM, HIDDEN2_DIM, HIDDEN3_DIM}); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer1)); + RT_CHECK(vx_mem_alloc(device, BATCH_SIZE * max_hidden * sizeof(TYPE), VX_MEM_READ_WRITE, &buffer2)); + + std::vector layer_configs(NUM_LAYERS); + weight_buffers.resize(NUM_LAYERS); + bias_buffers.resize(NUM_LAYERS); + metadata_buffers.resize(NUM_LAYERS, nullptr); + + for (int layer = 0; layer < NUM_LAYERS; ++layer) { + int in_dim = layer_dims[layer].first; + int out_dim = layer_dims[layer].second; + + layer_configs[layer].input_dim = in_dim; + layer_configs[layer].output_dim = out_dim; + + // Upload weights (compressed for sparse modes, dense for TGEMM) + std::vector& weights_to_upload = + (mlp_mode == MLP_MODE_TGEMM) ? 
h_weights_dense[layer] : h_weights_compressed[layer]; + + size_t weight_size = weights_to_upload.size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, weight_size, VX_MEM_READ, &weight_buffers[layer])); + RT_CHECK(vx_mem_address(weight_buffers[layer], &layer_configs[layer].weights_addr)); + RT_CHECK(vx_copy_to_dev(weight_buffers[layer], weights_to_upload.data(), 0, weight_size)); + + // Bias + size_t bias_size = h_biases[layer].size() * sizeof(TYPE); + RT_CHECK(vx_mem_alloc(device, bias_size, VX_MEM_READ, &bias_buffers[layer])); + RT_CHECK(vx_mem_address(bias_buffers[layer], &layer_configs[layer].bias_addr)); + RT_CHECK(vx_copy_to_dev(bias_buffers[layer], h_biases[layer].data(), 0, bias_size)); + + // Metadata for sparse modes + if (mlp_mode != MLP_MODE_TGEMM && !h_metadata[layer].empty()) { + size_t meta_size = h_metadata[layer].size(); + RT_CHECK(vx_mem_alloc(device, meta_size, VX_MEM_READ, &metadata_buffers[layer])); + RT_CHECK(vx_mem_address(metadata_buffers[layer], &layer_configs[layer].metadata_addr)); + RT_CHECK(vx_copy_to_dev(metadata_buffers[layer], h_metadata[layer].data(), 0, meta_size)); + } else { + layer_configs[layer].metadata_addr = 0; + } + } + + RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer)); + RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t))); + + std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl; + RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE))); + + std::cout << "Uploading kernel binary..." << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + kernel_arg_t kernel_arg = {}; + kernel_arg.num_layers = NUM_LAYERS; + kernel_arg.mode = mlp_mode; + RT_CHECK(vx_mem_address(input_buffer, &kernel_arg.input_addr)); + RT_CHECK(vx_mem_address(output_buffer, &kernel_arg.output_addr)); + RT_CHECK(vx_mem_address(layer_configs_buffer, &kernel_arg.layer_configs_addr)); + RT_CHECK(vx_mem_address(buffer1, &kernel_arg.buffer1_addr)); + RT_CHECK(vx_mem_address(buffer2, &kernel_arg.buffer2_addr)); + + std::cout << "Uploading kernel arguments..." << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + std::cout << "Starting VEGETA MLP inference on device..." << std::endl; + auto start = std::chrono::high_resolution_clock::now(); + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl; + + std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl; + std::vector h_output(BATCH_SIZE * OUTPUT_DIM); + RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE))); + + // CPU reference uses logical weights (with zeros for pruned elements) + std::cout << "Computing CPU reference..." << std::endl; + std::vector h_ref; + mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref); + + std::cout << "Verifying results..." << std::endl; + int errors = 0; + for (int b = 0; b < BATCH_SIZE; ++b) { + for (int i = 0; i < OUTPUT_DIM; ++i) { + compare_float(h_output[b * OUTPUT_DIM + i], h_ref[b * OUTPUT_DIM + i], b, i, errors); + } + } + + std::cout << "Cleaning up..." 
<< std::endl; + cleanup(); + + if (errors != 0) { + std::cout << "Found " << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + + std::cout << "PASSED!" << std::endl; + return 0; +} From 3f9e97b2e557058313c8503e9407ed46d6df60d4 Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Thu, 5 Feb 2026 21:43:32 +0530 Subject: [PATCH 2/3] MLP test corrections --- tests/regression/mlp/common.h | 2 + tests/regression/mlp/kernel.cpp | 24 ++--- tests/regression/mlp_vegeta/kernel.cpp | 93 ++++++++++++-------- tests/regression/mlp_vegeta/main.cpp | 117 +++++++++++++++++++------ 4 files changed, 164 insertions(+), 72 deletions(-) diff --git a/tests/regression/mlp/common.h b/tests/regression/mlp/common.h index 7cc37012a..476f234b0 100644 --- a/tests/regression/mlp/common.h +++ b/tests/regression/mlp/common.h @@ -1,6 +1,8 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/mlp/kernel.cpp b/tests/regression/mlp/kernel.cpp index 50e4ed24e..92ead2ec1 100644 --- a/tests/regression/mlp/kernel.cpp +++ b/tests/regression/mlp/kernel.cpp @@ -48,23 +48,23 @@ void softmax_find_max_thread(softmax_args_t* __UNIFORM__ args) { } } -// Thread function: compute exp in parallel, thread 0 sums -void softmax_exp_sum_thread(softmax_args_t* __UNIFORM__ args) { +// Thread function: compute exp in parallel +void softmax_exp_thread(softmax_args_t* __UNIFORM__ args) { uint32_t i = blockIdx.x; // All threads compute their exp value if (i < args->size) { args->data[i] = expf(args->data[i] - *(args->max_val)); } - - // Thread 0 does the sum reduction serially - if (i == 0) { - TYPE sum_exp = 0.0f; - for (uint32_t j = 0; j < args->size; ++j) { - sum_exp += args->data[j]; - } - *(args->sum_exp) = sum_exp; +} + +// Thread function: single thread sums all exp values +void softmax_sum_thread(softmax_args_t* __UNIFORM__ args) { + TYPE sum_exp = 0.0f; + for (uint32_t j = 0; j < args->size; ++j) { + sum_exp += args->data[j]; } + *(args->sum_exp) = sum_exp; } // Thread function: normalize in parallel @@ -86,7 +86,9 @@ void apply_softmax(TYPE* data, uint32_t size) { uint32_t single_thread = 1; vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_find_max_thread, &args); - vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_sum_thread, &args); + vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_exp_thread, &args); + + vx_spawn_threads(1, &single_thread, nullptr, (vx_kernel_func_cb)softmax_sum_thread, &args); vx_spawn_threads(1, &size, nullptr, (vx_kernel_func_cb)softmax_normalize_thread, &args); } diff --git a/tests/regression/mlp_vegeta/kernel.cpp b/tests/regression/mlp_vegeta/kernel.cpp index ad57212f0..ea516c616 100644 --- a/tests/regression/mlp_vegeta/kernel.cpp +++ b/tests/regression/mlp_vegeta/kernel.cpp @@ -6,21 +6,32 @@ // Aligned tile buffer for intermediate results static TYPE g_C_tile[TILE_SIZE * TILE_SIZE] __attribute__((aligned(64))); -// ReLU activation (in-place) for batched data +// ReLU activation (in-place) for tile-major batched data inline void apply_relu_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { - for (uint32_t b = 0; b < batch_size; ++b) { - for (uint32_t i = 0; i < dim; ++i) { - TYPE& v = data[b * dim + i]; - if (v < 0) v = 0; - } + // Data is in tile-major format: process all elements + uint32_t total_elements = batch_size * dim; + for (uint32_t i = 0; i < total_elements; ++i) { + if (data[i] < 0) data[i] = 0; } } -// Softmax for each sample in batch +// 
Softmax for each sample in batch (tile-major layout) void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { + // Convert tile-major to row-major for softmax processing + // Since dim is OUTPUT_DIM=16 which is one tile, data layout is simple + int num_tile_cols = dim / TILE_SIZE; + for (uint32_t b = 0; b < batch_size; ++b) { - TYPE* sample = data + b * dim; + // Extract this sample's values across tiles + TYPE sample[OUTPUT_DIM]; + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + sample[tile_col * TILE_SIZE + col] = data[tile_idx]; + } + } + // Compute softmax TYPE max_val = sample[0]; for (uint32_t i = 1; i < dim; ++i) { if (sample[i] > max_val) max_val = sample[i]; @@ -35,6 +46,14 @@ void apply_softmax_batch(TYPE* data, uint32_t batch_size, uint32_t dim) { for (uint32_t i = 0; i < dim; ++i) { sample[i] /= sum_exp; } + + // Write back to tile-major + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int col = 0; col < TILE_SIZE; ++col) { + int tile_idx = tile_col * (batch_size * TILE_SIZE) + b * TILE_SIZE + col; + data[tile_idx] = sample[tile_col * TILE_SIZE + col]; + } + } } } @@ -67,30 +86,31 @@ void vegeta_layer_tgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major layout: each tile column stores BATCH_SIZE×TILE_SIZE contiguously + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); TYPE* B_tile = weights + in_t * TILE_SIZE * out_dim + out_t * TILE_SIZE; vegeta_tgemm_tile_accumulate(A_tile, B_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// UGEMM: Dense × 2:4 Sparse (T × U → T) +// UGEMM: Dense × 2:4 Sparse Weights (T × U → T) // For 2:4 sparsity: weights are compressed to half size, metadata indicates positions +// T-reg contains sparse weights (with metadata), U-reg contains dense activations // ============================================================================= void vegeta_ugemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lu(2, (size_t)B_tile, 0); // Load B (2KB) to U2 - vx_ugemm(0, 1, 2); // T0 += T1 × U2 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for weights into M1 + vx_lu(2, (size_t)A_tile, 0); // Load dense activations A to U2 + vx_ugemm(0, 1, 2); // T0 += T1(sparse weights) × U2(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -105,7 +125,8 @@ void vegeta_layer_ugemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: U-reg loads 2×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); 
// Compressed weights: half the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 2) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -113,25 +134,25 @@ void vegeta_layer_ugemm_batch( vegeta_ugemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } // ============================================================================= -// VGEMM: Dense × 1:4 Sparse (T × V → T) +// VGEMM: Dense × 1:4 Sparse Weights (T × V → T) // For 1:4 sparsity: weights are compressed to quarter size, metadata indicates positions +// T-reg contains sparse weights (with metadata), V-reg contains dense activations // ============================================================================= void vegeta_vgemm_tile_accumulate(TYPE* A_tile, TYPE* B_tile, uint8_t* M_tile, TYPE* C_tile) { vx_lt(0, (size_t)C_tile, 0); // Load C to T0 (accumulator) - vx_lt(1, (size_t)A_tile, 0); // Load A to T1 - vx_lm(1, (size_t)M_tile, 0); // Load metadata to M1 - vx_lv(1, (size_t)B_tile, 0); // Load B (4KB) to V1 - vx_vgemm(0, 1, 1); // T0 += T1 × V1 (with M1 metadata) + vx_lt(1, (size_t)B_tile, 0); // Load compressed sparse weights B to T1 + vx_lm(1, (size_t)M_tile, 0); // Load metadata for sparse weights to M1 + vx_lv(1, (size_t)A_tile, 0); // Load dense activations A to V1 + vx_vgemm(0, 1, 1); // T0 += T1(sparse weights) × V1(dense activations) with M1 metadata vx_st((size_t)C_tile, 0, 0); // Store result } @@ -146,7 +167,8 @@ void vegeta_layer_vgemm_batch( clear_C_tile(); for (uint32_t in_t = 0; in_t < in_tiles; ++in_t) { - TYPE* A_tile = input + in_t * TILE_SIZE; + // Tile-major input: V-reg loads 4×TILE_SIZE worth of activation data + TYPE* A_tile = input + in_t * (BATCH_SIZE * TILE_SIZE); // Compressed weights: quarter the K dimension TYPE* B_tile = weights + (in_t * TILE_SIZE / 4) * out_dim + out_t * TILE_SIZE; // Metadata: 128 bytes per tile @@ -154,11 +176,10 @@ void vegeta_layer_vgemm_batch( vegeta_vgemm_tile_accumulate(A_tile, B_tile, M_tile, g_C_tile); } - for (uint32_t row = 0; row < BATCH_SIZE; ++row) { - for (uint32_t col = 0; col < TILE_SIZE; ++col) { - output[row * out_dim + out_t * TILE_SIZE + col] = - g_C_tile[row * TILE_SIZE + col] + bias[out_t * TILE_SIZE + col]; - } + // Store result in tile-major format + for (uint32_t i = 0; i < BATCH_SIZE * TILE_SIZE; ++i) { + output[out_t * (BATCH_SIZE * TILE_SIZE) + i] = + g_C_tile[i] + bias[out_t * TILE_SIZE + (i % TILE_SIZE)]; } } } diff --git a/tests/regression/mlp_vegeta/main.cpp b/tests/regression/mlp_vegeta/main.cpp index a801b854c..ac9e4cf3b 100644 --- a/tests/regression/mlp_vegeta/main.cpp +++ b/tests/regression/mlp_vegeta/main.cpp @@ -84,6 +84,47 @@ void cleanup() { } } +/////////////////////////////////////////////////////////////////////////////// +// Tile-Major Layout Conversion +/////////////////////////////////////////////////////////////////////////////// + +// Convert row-major [batch × dim] to tile-major layout for VEGETA loads +// Input: row_major[batch × dim] where batch=16, dim is multiple of 16 +// Output: tile_major stores data as contiguous 16×16 tiles +// Memory layout: For each tile column, 
store all 16 rows contiguously +void convert_to_tile_major(const std::vector& row_major, int batch, int dim, + std::vector& tile_major) { + int num_tile_cols = dim / TILE_SIZE; + tile_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = row * dim + tile_col * TILE_SIZE + col; + int dst_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + tile_major[dst_idx] = row_major[src_idx]; + } + } + } +} + +// Convert tile-major back to row-major [batch × dim] +void convert_from_tile_major(const std::vector& tile_major, int batch, int dim, + std::vector& row_major) { + int num_tile_cols = dim / TILE_SIZE; + row_major.resize(batch * dim); + + for (int tile_col = 0; tile_col < num_tile_cols; ++tile_col) { + for (int row = 0; row < batch; ++row) { + for (int col = 0; col < TILE_SIZE; ++col) { + int src_idx = tile_col * (batch * TILE_SIZE) + row * TILE_SIZE + col; + int dst_idx = row * dim + tile_col * TILE_SIZE + col; + row_major[dst_idx] = tile_major[src_idx]; + } + } + } +} + /////////////////////////////////////////////////////////////////////////////// // Sparse Weight Compression /////////////////////////////////////////////////////////////////////////////// @@ -115,25 +156,38 @@ void compress_2_4_sparse(const std::vector& dense, int in_dim, int out_dim // Sort by magnitude (descending) std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; }); - // Keep top 2, generate mask + // Create bitmask for top 2 values uint8_t mask = 0; for (int k = 0; k < 2; ++k) { int pos = vals[k].second; mask |= (1 << pos); - compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j]; - logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; - comp_idx++; } - // Store metadata (simplified - in real impl would be tile-based) + // Store compressed values in POSITION ORDER (not magnitude order) + // Hardware iterates through bit positions 0-3 and expects values in that order + for (int pos = 0; pos < 4; ++pos) { + if (mask & (1 << pos)) { + compressed[comp_idx * out_dim + j] = dense[(i + pos) * out_dim + j]; + logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j]; + comp_idx++; + } + } + + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles + // Byte layout per row: byte 0 = cols 0,1; byte 1 = cols 2,3; ...; byte 7 = cols 14,15 int tile_in = i / TILE_SIZE; int tile_out = j / TILE_SIZE; int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES; - int row_in_tile = (i % TILE_SIZE) / 4; - int col_in_tile = j % TILE_SIZE; - int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile; - if (byte_offset < (int)metadata.size()) { - metadata[byte_offset] = mask; + int row_in_tile = i % TILE_SIZE; + int k_grp = (i % TILE_SIZE) / 4; // Which group of 4 in this row + int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2; + if (byte_idx < (int)metadata.size()) { + if (k_grp % 2 == 0) { + metadata[byte_idx] = (mask << 4); // Upper nibble + } else { + metadata[byte_idx] |= mask; // Lower nibble + } } } } @@ -168,16 +222,21 @@ void compress_1_4_sparse(const std::vector& dense, int in_dim, int out_dim logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j]; comp_idx++; - // Store metadata + // Store metadata in 16×16 nibble format (128 bytes per tile) + // Each row has 8 bytes, each byte has 2 nibbles uint8_t mask = (1 << 
 
 ///////////////////////////////////////////////////////////////////////////////
 // Sparse Weight Compression
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -115,25 +156,38 @@ void compress_2_4_sparse(const std::vector<TYPE>& dense, int in_dim, int out_dim
       // Sort by magnitude (descending)
       std::sort(vals, vals + 4, [](auto& a, auto& b) { return a.first > b.first; });
 
-      // Keep top 2, generate mask
+      // Create bitmask for top 2 values
       uint8_t mask = 0;
       for (int k = 0; k < 2; ++k) {
         int pos = vals[k].second;
         mask |= (1 << pos);
-        compressed[(comp_idx / 2) * out_dim + j] = dense[(i + pos) * out_dim + j];
-        logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j];
-        comp_idx++;
       }
 
-      // Store metadata (simplified - in real impl would be tile-based)
+      // Store compressed values in POSITION ORDER (not magnitude order)
+      // Hardware iterates through bit positions 0-3 and expects values in that order
+      for (int pos = 0; pos < 4; ++pos) {
+        if (mask & (1 << pos)) {
+          compressed[comp_idx * out_dim + j] = dense[(i + pos) * out_dim + j];
+          logical[(i + pos) * out_dim + j] = dense[(i + pos) * out_dim + j];
+          comp_idx++;
+        }
+      }
+
+      // Store metadata in 16×16 nibble format (128 bytes per tile)
+      // Each row has 8 bytes, each byte has 2 nibbles
+      // Byte layout per row: byte 0 = cols 0,1; byte 1 = cols 2,3; ...; byte 7 = cols 14,15
       int tile_in = i / TILE_SIZE;
       int tile_out = j / TILE_SIZE;
       int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES;
-      int row_in_tile = (i % TILE_SIZE) / 4;
-      int col_in_tile = j % TILE_SIZE;
-      int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile;
-      if (byte_offset < (int)metadata.size()) {
-        metadata[byte_offset] = mask;
+      int row_in_tile = i % TILE_SIZE;
+      int k_grp = (i % TILE_SIZE) / 4;  // Which group of 4 in this row
+      int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2;
+      if (byte_idx < (int)metadata.size()) {
+        if (k_grp % 2 == 0) {
+          metadata[byte_idx] = (mask << 4);  // Upper nibble
+        } else {
+          metadata[byte_idx] |= mask;        // Lower nibble
+        }
       }
     }
   }
@@ -168,16 +222,21 @@ void compress_1_4_sparse(const std::vector<TYPE>& dense, int in_dim, int out_dim
       logical[(i + max_pos) * out_dim + j] = dense[(i + max_pos) * out_dim + j];
       comp_idx++;
 
-      // Store metadata
+      // Store metadata in 16×16 nibble format (128 bytes per tile)
+      // Each row has 8 bytes, each byte has 2 nibbles
       uint8_t mask = (1 << max_pos);
       int tile_in = i / TILE_SIZE;
       int tile_out = j / TILE_SIZE;
       int meta_offset = (tile_in * out_tiles + tile_out) * M_TILE_BYTES;
-      int row_in_tile = (i % TILE_SIZE) / 4;
-      int col_in_tile = j % TILE_SIZE;
-      int byte_offset = meta_offset + row_in_tile * TILE_SIZE + col_in_tile;
-      if (byte_offset < (int)metadata.size()) {
-        metadata[byte_offset] = mask;
+      int row_in_tile = i % TILE_SIZE;
+      int k_grp = (i % TILE_SIZE) / 4;  // Which group of 4 in this row
+      int byte_idx = meta_offset + row_in_tile * 8 + k_grp / 2;
+      if (byte_idx < (int)metadata.size()) {
+        if (k_grp % 2 == 0) {
+          metadata[byte_idx] = (mask << 4);  // Upper nibble
+        } else {
+          metadata[byte_idx] |= mask;        // Lower nibble
+        }
       }
     }
   }
@@ -301,12 +360,16 @@ int main(int argc, char *argv[]) {
   std::cout << "Opening device connection..." << std::endl;
   RT_CHECK(vx_dev_open(&device));
 
-  // Generate batched input
-  std::vector<TYPE> h_input(BATCH_SIZE * INPUT_DIM);
-  for (auto& v : h_input) {
+  // Generate batched input (row-major)
+  std::vector<TYPE> h_input_row_major(BATCH_SIZE * INPUT_DIM);
+  for (auto& v : h_input_row_major) {
     v = static_cast<TYPE>(rand()) / RAND_MAX;
   }
 
+  // Convert to tile-major for device
+  std::vector<TYPE> h_input_tile_major;
+  convert_to_tile_major(h_input_row_major, BATCH_SIZE, INPUT_DIM, h_input_tile_major);
+
   // Generate weights, biases, and optional sparse compression
   std::vector<std::vector<TYPE>> h_weights_dense(NUM_LAYERS);
   std::vector<std::vector<TYPE>> h_weights_compressed(NUM_LAYERS);
@@ -394,8 +457,8 @@ int main(int argc, char *argv[]) {
   RT_CHECK(vx_mem_alloc(device, layer_configs.size() * sizeof(layer_config_t), VX_MEM_READ, &layer_configs_buffer));
   RT_CHECK(vx_copy_to_dev(layer_configs_buffer, layer_configs.data(), 0, layer_configs.size() * sizeof(layer_config_t)));
 
-  std::cout << "Uploading input data (" << BATCH_SIZE << " samples)..." << std::endl;
-  RT_CHECK(vx_copy_to_dev(input_buffer, h_input.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE)));
+  std::cout << "Uploading input data (" << BATCH_SIZE << " samples, tile-major)..." << std::endl;
+  RT_CHECK(vx_copy_to_dev(input_buffer, h_input_tile_major.data(), 0, BATCH_SIZE * INPUT_DIM * sizeof(TYPE)));
 
   std::cout << "Uploading kernel binary..." << std::endl;
   RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
@@ -420,14 +483,18 @@ int main(int argc, char *argv[]) {
   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
   std::cout << "Elapsed time: " << duration.count() << " ms" << std::endl;
 
-  std::cout << "Downloading output (" << BATCH_SIZE << " samples)..." << std::endl;
-  std::vector<TYPE> h_output(BATCH_SIZE * OUTPUT_DIM);
-  RT_CHECK(vx_copy_from_dev(h_output.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE)));
+  std::cout << "Downloading output (" << BATCH_SIZE << " samples, tile-major)..." << std::endl;
+  std::vector<TYPE> h_output_tile_major(BATCH_SIZE * OUTPUT_DIM);
+  RT_CHECK(vx_copy_from_dev(h_output_tile_major.data(), output_buffer, 0, BATCH_SIZE * OUTPUT_DIM * sizeof(TYPE)));
+
+  // Convert output back to row-major for comparison
+  std::vector<TYPE> h_output;
+  convert_from_tile_major(h_output_tile_major, BATCH_SIZE, OUTPUT_DIM, h_output);
 
   // CPU reference uses logical weights (with zeros for pruned elements)
   std::cout << "Computing CPU reference..." << std::endl;
   std::vector<TYPE> h_ref;
-  mlp_cpu_batch(h_input, h_weights_logical, h_biases, layer_dims, h_ref);
+  mlp_cpu_batch(h_input_row_major, h_weights_logical, h_biases, layer_dims, h_ref);
 
   std::cout << "Verifying results..."
<< std::endl; int errors = 0; From abd48054063a341eb5424525aea69344d916e4ca Mon Sep 17 00:00:00 2001 From: tandonmitul27 Date: Sun, 8 Feb 2026 20:04:33 +0530 Subject: [PATCH 3/3] Added a separate LSU for Vegeta Engine --- sim/simx/Makefile | 7 +- sim/simx/core.cpp | 1 + sim/simx/core.h | 6 ++ sim/simx/execute.cpp | 5 -- sim/simx/sparse_unit.cpp | 145 ++++++++++++++++++++++----------------- sim/simx/vegeta_lsu.cpp | 134 ++++++++++++++++++++++++++++++++++++ sim/simx/vegeta_lsu.h | 102 +++++++++++++++++++++++++++ 7 files changed, 330 insertions(+), 70 deletions(-) create mode 100644 sim/simx/vegeta_lsu.cpp create mode 100644 sim/simx/vegeta_lsu.h diff --git a/sim/simx/Makefile b/sim/simx/Makefile index bbb800612..43267ed74 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -27,8 +27,6 @@ SRCS += $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp SRCS += $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp SRCS += $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp -# sparse unit; add -DEXT_SPARSE_ENABLE flag later -SRCS += $(SRC_DIR)/sparse_unit.cpp # Add V extension sources ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),) @@ -42,6 +40,11 @@ endif ifneq ($(findstring -DEXT_TCU_ENABLE, $(CONFIGS)),) SRCS += $(SRC_DIR)/tensor_unit.cpp endif +# Add VEGETA extension sources +ifneq ($(findstring -DEXT_VEGETA_ENABLE, $(CONFIGS)),) + SRCS += $(SRC_DIR)/vegeta_lsu.cpp + SRCS += $(SRC_DIR)/sparse_unit.cpp +endif # Debugging ifdef DEBUG diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 49617318e..ba3644efd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -64,6 +64,7 @@ Core::Core(const SimContext& ctx, #endif #ifdef EXT_VEGETA_ENABLE , sparse_unit_(SparseUnit::Create("spu", arch, this)) + , vegeta_lsu_(VegetaLsu::Create("vegeta_lsu", this, 1)) #endif , emulator_(arch, dcrs, this) , ibuffers_(arch.num_warps(), IBUF_SIZE) diff --git a/sim/simx/core.h b/sim/simx/core.h index dcc675a51..30d96296a 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -31,6 +31,7 @@ #endif #ifdef EXT_VEGETA_ENABLE #include "sparse_unit.h" +#include "vegeta_lsu.h" #endif #include "dispatcher.h" @@ -184,6 +185,10 @@ class Core : public SimObject { SparseUnit::Ptr& sparse_unit() { return sparse_unit_; } + + VegetaLsu::Ptr& vegeta_lsu() { + return vegeta_lsu_; + } #endif auto& trace_pool() { @@ -217,6 +222,7 @@ class Core : public SimObject { #ifdef EXT_VEGETA_ENABLE SparseUnit::Ptr sparse_unit_; + VegetaLsu::Ptr vegeta_lsu_; #endif Emulator emulator_; diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index 80c3132f9..7cb99d1fc 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1556,7 +1556,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_T: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1570,7 +1569,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_U: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1585,7 +1583,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_V: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register 
indices from instruction uint32_t dst_reg = rdest.idx; @@ -1599,7 +1596,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { case VegetaTcuType::TILE_GEMM_R: { auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Extract tile register indices from instruction uint32_t dst_reg = rdest.idx; @@ -1614,7 +1610,6 @@ instr_trace_t* Emulator::execute(const Instr &instr, uint32_t wid) { auto tpuArgs = std::get(instrArgs); auto trace_data = std::make_shared(); trace->data = trace_data; - assert(warp.tmask.count() == num_threads); // Get metadata from integer registers a0-a7 (x10-x17) for sparse fragA // These contain metadata values loaded by mma_sync into a0-a7 diff --git a/sim/simx/sparse_unit.cpp b/sim/simx/sparse_unit.cpp index 275146b23..2740c391e 100644 --- a/sim/simx/sparse_unit.cpp +++ b/sim/simx/sparse_unit.cpp @@ -16,6 +16,7 @@ #include "sparse_cfg.h" #include #include "core.h" +#include "vegeta_lsu.h" #include using namespace vortex; @@ -772,90 +773,97 @@ class SparseUnit::Impl { uint32_t tile_reg_idx = vd; assert(tile_reg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[tile_reg_idx]; - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); // 4 bytes for fp32 base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes + // Use VegetaLsu for bulk tile load (1KB) + constexpr uint32_t T_TILE_SIZE = TILE_DIM * TILE_DIM * sizeof(float); + float tile_buffer[TILE_DIM * TILE_DIM]; + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::T_TILE, + tile_reg_idx, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile register for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = base_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - // Interpret as float and store in tile register - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[row * TILE_DIM + col]; } } - DP(2, "TILE_LOAD_T: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < TILE_DIM * TILE_DIM; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_T (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", tile_reg_idx=" << tile_reg_idx << ", base_addr=0x" << std::hex << base_addr << std::dec); break; } case VegetaLsuType::TILE_LOAD_U: { // tile_load_u: DestReg contains ureg index, map to tile registers - // ureg 0 -> tile reg 0, 1 + // ureg 0 -> tile reg 0, 1 (2KB total = 2 T-tiles) std::vector target_tregs = map_ureg_to_treg(vd); base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); - uint64_t current_addr = base_addr; - for (uint32_t treg_idx : target_tregs) { + // Use VegetaLsu for bulk U-tile load (2KB) + constexpr uint32_t T_TILE_ELEMENTS = TILE_DIM * TILE_DIM; + float tile_buffer[T_TILE_ELEMENTS * 2]; // 2 T-tiles for U-reg + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::U_TILE, + vd, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile 
registers + for (uint32_t t = 0; t < target_tregs.size(); ++t) { + uint32_t treg_idx = target_tregs[t]; assert(treg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[treg_idx]; - - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = current_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[t * T_TILE_ELEMENTS + row * TILE_DIM + col]; } } - current_addr += TILE_DIM * TILE_DIM * ELEMENT_SIZE; // Move to next tile (1KB) } - DP(2, "TILE_LOAD_U: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < T_TILE_ELEMENTS * 2; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_U (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", ureg_idx=" << vd << ", target_tregs=[" << target_tregs[0] << ", " << target_tregs[1] << "], base_addr=0x" << std::hex << base_addr << std::dec); break; } case VegetaLsuType::TILE_LOAD_V: { // tile_load_v: DestReg contains vreg index, map to tile registers - // vreg 0 -> tile reg 0, 1, 2, 3 + // vreg 0 -> tile reg 0, 1, 2, 3 (4KB total = 4 T-tiles) std::vector target_tregs = map_vreg_to_treg(vd); base_addr &= 0xFFFFFFFC; // Align to word boundary for fp32 loads - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); - uint64_t current_addr = base_addr; - for (uint32_t treg_idx : target_tregs) { + // Use VegetaLsu for bulk V-tile load (4KB) + constexpr uint32_t T_TILE_ELEMENTS = TILE_DIM * TILE_DIM; + float tile_buffer[T_TILE_ELEMENTS * 4]; // 4 T-tiles for V-reg + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::V_TILE, + vd, wid, tid, tile_buffer); + + // Copy from linear buffer to 2D tile registers + for (uint32_t t = 0; t < target_tregs.size(); ++t) { + uint32_t treg_idx = target_tregs[t]; assert(treg_idx < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[treg_idx]; - - // Load tile from memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = current_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - uint32_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); - - float value; - std::memcpy(&value, &mem_data, ELEMENT_SIZE); - tile_reg[row][col] = value; + tile_reg[row][col] = tile_buffer[t * T_TILE_ELEMENTS + row * TILE_DIM + col]; } } - current_addr += TILE_DIM * TILE_DIM * ELEMENT_SIZE; // Move to next tile (1KB) } - DP(2, "TILE_LOAD_V: wid=" << wid << ", tid=" << tid + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < T_TILE_ELEMENTS * 4; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } + + DP(2, "TILE_LOAD_V (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", vreg_idx=" << vd << ", target_tregs=[" << target_tregs[0] << ", " << target_tregs[1] << ", " << 
target_tregs[2] << ", " << target_tregs[3] << "], base_addr=0x" << std::hex << base_addr << std::dec); @@ -867,22 +875,28 @@ class SparseUnit::Impl { assert(meta_reg_idx < metadata_reg_file_.size() && "Metadata register index out of bounds"); auto &metadata_reg = metadata_reg_file_[meta_reg_idx]; - // Load metadata from memory: 16 rows x 16 columns = 256 uint4 elements = 128 bytes + // Use VegetaLsu for bulk M-tile load (128 bytes) + constexpr uint32_t M_TILE_SIZE = 128; + uint8_t meta_buffer[M_TILE_SIZE]; + core_->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::M_TILE, + meta_reg_idx, wid, tid, meta_buffer); + + // Parse nibbles from linear buffer into metadata register // Each byte stores two uint4 values: upper nibble for col N, lower nibble for col N+1 for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; col += 2) { - uint64_t mem_addr = base_addr + (row * (TILE_DIM / 2) + col / 2); - uint8_t mem_data = 0; - core_->dcache_read(&mem_data, mem_addr, 1); - trace_data->mem_addrs.at(tid).push_back({mem_addr, 1}); - - // Upper nibble for col N, lower nibble for col N+1 - metadata_reg[row][col] = (mem_data >> 4) & 0x0F; - metadata_reg[row][col + 1] = mem_data & 0x0F; + uint8_t byte = meta_buffer[row * (TILE_DIM / 2) + col / 2]; + metadata_reg[row][col] = (byte >> 4) & 0x0F; + metadata_reg[row][col + 1] = byte & 0x0F; } } + + // Record trace for all bytes + for (uint32_t i = 0; i < M_TILE_SIZE; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i, 1}); + } - DP(2, "TILE_LOAD_M: wid=" << wid << ", tid=" << tid + DP(2, "TILE_LOAD_M (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", metadata_reg_idx=" << meta_reg_idx << ", base_addr=0x" << std::hex << base_addr << std::dec); break; } @@ -911,21 +925,26 @@ class SparseUnit::Impl { assert(vs3 < tile_reg_file_.size() && "Tile register index out of bounds"); auto &tile_reg = tile_reg_file_[vs3]; constexpr uint32_t TILE_DIM = 16; - constexpr uint32_t ELEMENT_SIZE = sizeof(typename vt::fp32::dtype); // 4 bytes for fp32 - // Store tile to memory: 16 rows x 16 columns = 256 fp32 elements = 1024 bytes + // Copy 2D tile register to linear buffer for VegetaLsu + float tile_buffer[TILE_DIM * TILE_DIM]; for (uint32_t row = 0; row < TILE_DIM; ++row) { for (uint32_t col = 0; col < TILE_DIM; ++col) { - uint64_t mem_addr = base_addr + (row * TILE_DIM + col) * ELEMENT_SIZE; - float value = tile_reg[row][col]; - uint32_t mem_data = 0; - std::memcpy(&mem_data, &value, ELEMENT_SIZE); - core_->dcache_write(&mem_data, mem_addr, ELEMENT_SIZE); - trace_data->mem_addrs.at(tid).push_back({mem_addr, ELEMENT_SIZE}); + tile_buffer[row * TILE_DIM + col] = tile_reg[row][col]; } } + + // Use VegetaLsu for bulk tile store (1KB) + core_->vegeta_lsu()->store_tile(base_addr, VegetaLsu::TileType::T_TILE, + vs3, wid, tid, tile_buffer); + + // Record trace for all elements + constexpr uint32_t ELEMENT_SIZE = sizeof(float); + for (uint32_t i = 0; i < TILE_DIM * TILE_DIM; ++i) { + trace_data->mem_addrs.at(tid).push_back({base_addr + i * ELEMENT_SIZE, ELEMENT_SIZE}); + } - DP(2, "TILE_STORE: wid=" << wid << ", tid=" << tid << ", vs3=" << vs3 + DP(2, "TILE_STORE (via VegetaLsu): wid=" << wid << ", tid=" << tid << ", vs3=" << vs3 << ", base_addr=0x" << std::hex << base_addr << std::dec); #else std::abort(); // EXT_VEGETA_ENABLE required for store operations diff --git a/sim/simx/vegeta_lsu.cpp b/sim/simx/vegeta_lsu.cpp new file mode 100644 index 000000000..7bc782b0e --- /dev/null +++ b/sim/simx/vegeta_lsu.cpp @@ -0,0 +1,134 @@ +// 
Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "vegeta_lsu.h"
+#include "core.h"
+#include "debug.h"
+#include <algorithm>
+
+using namespace vortex;
+
+// Tile sizes in bytes
+static constexpr uint32_t T_TILE_SIZE = 1024;  // 16x16 x 4 bytes
+static constexpr uint32_t U_TILE_SIZE = 2048;  // 2 T-tiles
+static constexpr uint32_t V_TILE_SIZE = 4096;  // 4 T-tiles
+static constexpr uint32_t M_TILE_SIZE = 128;   // 16x16 x 0.5 bytes (nibbles)
+
+// Cache line size (from config)
+static constexpr uint32_t CACHE_LINE_SIZE = 64;
+
+VegetaLsu::VegetaLsu(const SimContext& ctx, const char* name, Core* core, uint32_t num_ports)
+  : SimObject(ctx, name)
+  , dcache_req_ports(num_ports, this)
+  , dcache_rsp_ports(num_ports, this)
+  , core_(core)
+  , num_ports_(num_ports)
+  , perf_stats_()
+{}
+
+VegetaLsu::~VegetaLsu() {}
+
+void VegetaLsu::reset() {
+  perf_stats_ = PerfStats();
+}
+
+void VegetaLsu::tick() {
+  // Process cache responses (for future async implementation)
+  for (uint32_t p = 0; p < num_ports_; ++p) {
+    auto& rsp_port = dcache_rsp_ports.at(p);
+    if (!rsp_port.empty()) {
+      // For now, we use synchronous access via core_->dcache_read/write
+      rsp_port.pop();
+    }
+  }
+}
+
+uint32_t VegetaLsu::tile_size(TileType type) {
+  switch (type) {
+  case TileType::T_TILE: return T_TILE_SIZE;
+  case TileType::U_TILE: return U_TILE_SIZE;
+  case TileType::V_TILE: return V_TILE_SIZE;
+  case TileType::M_TILE: return M_TILE_SIZE;
+  default: return 0;
+  }
+}
+
+void VegetaLsu::load_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                          uint32_t wid, uint32_t tid, void* reg_data) {
+  __unused(reg_idx, wid, tid);
+
+  uint32_t size = tile_size(type);
+  uint8_t* data = reinterpret_cast<uint8_t*>(reg_data);
+
+  DP(2, "VEGETA_LSU: load_tile type=" << (int)type << " addr=0x" << std::hex << addr
+        << " size=" << std::dec << size << " reg=" << reg_idx);
+
+  // Issue cache-line sized requests
+  uint32_t bytes_loaded = 0;
+  uint64_t current_addr = addr;
+
+  while (bytes_loaded < size) {
+    uint32_t chunk_size = std::min(size - bytes_loaded, CACHE_LINE_SIZE);
+
+    // Use core's dcache_read for each cache line
+    core_->dcache_read(data + bytes_loaded, current_addr, chunk_size);
+
+    ++perf_stats_.cache_requests;
+    bytes_loaded += chunk_size;
+    current_addr += chunk_size;
+  }
+
+  ++perf_stats_.tile_loads;
+  perf_stats_.total_bytes += size;
+}
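+
+// For reference (illustrative arithmetic, assuming the tile base address is
+// cache-line aligned): a T-tile load above issues 1024/64 = 16 dcache requests,
+// a U-tile 32, a V-tile 64, and an M-tile 128/64 = 2.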
+void VegetaLsu::store_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                           uint32_t wid, uint32_t tid, const void* reg_data) {
+  __unused(reg_idx, wid, tid);
+
+  uint32_t size = tile_size(type);
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(reg_data);
+
+  DP(2, "VEGETA_LSU: store_tile type=" << (int)type << " addr=0x" << std::hex << addr
+        << " size=" << std::dec << size << " reg=" << reg_idx);
+
+  // Issue cache-line sized requests
+  uint32_t bytes_stored = 0;
+  uint64_t current_addr = addr;
+
+  while (bytes_stored < size) {
+    uint32_t chunk_size = std::min(size - bytes_stored, CACHE_LINE_SIZE);
+
+    // Use core's dcache_write for each cache line
+    core_->dcache_write(data + bytes_stored, current_addr, chunk_size);
+
+    ++perf_stats_.cache_requests;
+    bytes_stored += chunk_size;
+    current_addr += chunk_size;
+  }
+
+  ++perf_stats_.tile_stores;
+  perf_stats_.total_bytes += size;
+}
+
+void VegetaLsu::issue_tile_requests(const TileRequest& req, void* data) {
+  if (req.is_store) {
+    store_tile(req.addr, req.type, req.reg_idx, req.wid, req.tid, data);
+  } else {
+    load_tile(req.addr, req.type, req.reg_idx, req.wid, req.tid, data);
+  }
+}
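+
+// Example of the resulting counters (illustrative): one T-tile load followed by one
+// T-tile store leaves perf_stats_ at tile_loads=1, tile_stores=1, cache_requests=32
+// and total_bytes=2048.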
diff --git a/sim/simx/vegeta_lsu.h b/sim/simx/vegeta_lsu.h
new file mode 100644
index 000000000..df27a14a3
--- /dev/null
+++ b/sim/simx/vegeta_lsu.h
@@ -0,0 +1,102 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include <vector>
+#include <cstdint>
+#include "types.h"
+#include "instr_trace.h"
+
+namespace vortex {
+
+class Core;
+
+// VEGETA Tile LSU - Dedicated Load/Store Unit for tile operations
+// Handles bulk tile transfers (T-tile: 1KB, M-tile: 128B) via L1 D-Cache
+class VegetaLsu : public SimObject<VegetaLsu> {
+public:
+  // Tile request types
+  enum class TileType {
+    T_TILE = 0,  // 16x16 float = 1024 bytes
+    U_TILE = 1,  // 2 T-tiles = 2048 bytes
+    V_TILE = 2,  // 4 T-tiles = 4096 bytes
+    M_TILE = 3   // 16x16 nibbles = 128 bytes (metadata)
+  };
+
+  // Tile request structure
+  struct TileRequest {
+    uint64_t addr;      // Base address
+    TileType type;      // Tile type
+    uint32_t reg_idx;   // Destination/source register index
+    bool     is_store;  // true = store, false = load
+    uint32_t wid;       // Warp ID
+    uint32_t tid;       // Thread ID
+  };
+
+  struct PerfStats {
+    uint64_t tile_loads;
+    uint64_t tile_stores;
+    uint64_t cache_requests;
+    uint64_t total_bytes;
+
+    PerfStats()
+      : tile_loads(0)
+      , tile_stores(0)
+      , cache_requests(0)
+      , total_bytes(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->tile_loads     += rhs.tile_loads;
+      this->tile_stores    += rhs.tile_stores;
+      this->cache_requests += rhs.cache_requests;
+      this->total_bytes    += rhs.total_bytes;
+      return *this;
+    }
+  };
+
+  // Memory request/response ports for L1 cache connection
+  std::vector<SimPort<MemReq>> dcache_req_ports;
+  std::vector<SimPort<MemRsp>> dcache_rsp_ports;
+
+  VegetaLsu(const SimContext& ctx, const char* name, Core* core, uint32_t num_ports);
+  ~VegetaLsu();
+
+  void reset();
+  void tick();
+
+  // Load tile from memory to register
+  void load_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                 uint32_t wid, uint32_t tid, void* reg_data);
+
+  // Store tile from register to memory
+  void store_tile(uint64_t addr, TileType type, uint32_t reg_idx,
+                  uint32_t wid, uint32_t tid, const void* reg_data);
+
+  // Get tile size in bytes
+  static uint32_t tile_size(TileType type);
+
+  const PerfStats& perf_stats() const { return perf_stats_; }
+
+private:
+  Core* core_;
+  uint32_t num_ports_;
+  PerfStats perf_stats_;
+
+  // Issue cache-line requests for a tile
+  void issue_tile_requests(const TileRequest& req, void* data);
+};
+
+} // namespace vortex
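+
+// Illustrative usage from a caller that owns a 16x16 staging buffer (this mirrors how
+// SparseUnit::Impl drives the unit; 'staging' and the index variables are placeholders):
+//   float staging[16 * 16];
+//   core->vegeta_lsu()->load_tile(base_addr, VegetaLsu::TileType::T_TILE,
+//                                 tile_reg_idx, wid, tid, staging);
+//   // ... copy 'staging' into the 16x16 tile register ...
+//   auto& stats = core->vegeta_lsu()->perf_stats();  // e.g. stats.tile_loads == 1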