scroll-tech · darth-cy · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
diff --git a/extensions/native/circuit/cuda/src/hint_space_provider.cu b/extensions/native/circuit/cuda/src/hint_space_provider.cu
@@ -0,0 +1,56 @@
+#include "launcher.cuh"
+#include "primitives/trace_access.h"
+
+// Columns layout matches HintSpaceProviderCols<T> in hint_space_provider.rs
+// Fields: hint_id, offset, value, is_valid
+template <typename T> struct HintSpaceProviderCols {
+    T hint_id;
+    T offset;
+    T value;
+    T is_valid;
+};
+
+constexpr uint32_t HINT_SPACE_PROVIDER_WIDTH = sizeof(HintSpaceProviderCols<uint8_t>);
+
+__global__ void hint_space_provider_tracegen(
+    Fp *trace,
+    size_t height,
+    const Fp *records,
+    size_t rows_used
+) {
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= height) {
+        return;
+    }
+
+    RowSlice row(trace + idx, height);
+    if (idx < rows_used) {
+        // Each record is a triple (hint_id, offset, value)
+        const Fp *rec = records + idx * 3;
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, hint_id, rec[0]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, offset, rec[1]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, value, rec[2]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, is_valid, Fp::one());
+    } else {
+        row.fill_zero(0, HINT_SPACE_PROVIDER_WIDTH);
+    }
+}
+
+extern "C" int _hint_space_provider_tracegen(
+    Fp *d_trace,
+    size_t height,
+    size_t width,
+    const Fp *d_records,
+    size_t rows_used
+) {
+    assert((height & (height - 1)) == 0);
+    assert(width == HINT_SPACE_PROVIDER_WIDTH);
+    auto [grid, block] = kernel_launch_params(height);
+    hint_space_provider_tracegen<<<grid, block>>>(
+        d_trace,
+        height,
+        d_records,
+        rows_used
+    );
+    return CHECK_KERNEL();
+}
diff --git a/extensions/native/circuit/src/cuda_abi.rs b/extensions/native/circuit/src/cuda_abi.rs
@@ -345,3 +345,33 @@ pub mod native_jal_rangecheck_cuda {
         ))
     }
 }
+
+pub mod hint_space_provider_cuda {
+    use super::*;
+
+    extern "C" {
+        pub fn _hint_space_provider_tracegen(
+            d_trace: *mut F,
+            height: usize,
+            width: usize,
+            d_records: *const F,
+            rows_used: usize,
+        ) -> i32;
+    }
+
+    pub unsafe fn tracegen(
+        d_trace: &DeviceBuffer<F>,
+        height: usize,
+        width: usize,
+        d_records: &DeviceBuffer<F>,
+        rows_used: usize,
+    ) -> Result<(), CudaError> {
+        CudaError::from_result(_hint_space_provider_tracegen(
+            d_trace.as_mut_ptr(),
+            height,
+            width,
+            d_records.as_ptr(),
+            rows_used,
+        ))
+    }
+}
diff --git a/extensions/native/circuit/src/extension/cuda.rs b/extensions/native/circuit/src/extension/cuda.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use openvm_circuit::{
     arch::{ChipInventory, ChipInventoryError, DenseRecordArena, VmProverExtension},
     system::cuda::extensions::get_inventory_range_checker,
@@ -14,6 +16,7 @@ use crate::{
     field_arithmetic::{FieldArithmeticAir, FieldArithmeticChipGpu},
     field_extension::{FieldExtensionAir, FieldExtensionChipGpu},
     fri::{FriReducedOpeningAir, FriReducedOpeningChipGpu},
+    hint_space_provider::{cuda::HintSpaceProviderChipGpu, HintSpaceProviderAir, HintSpaceProviderChip},
     jal_rangecheck::{JalRangeCheckAir, JalRangeCheckGpu},
     loadstore::{NativeLoadStoreAir, NativeLoadStoreChipGpu},
     poseidon2::{air::NativePoseidon2Air, NativePoseidon2ChipGpu},
@@ -76,8 +79,16 @@ impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Native>
         let poseidon2 = NativePoseidon2ChipGpu::<1>::new(range_checker.clone(), timestamp_max_bits);
         inventory.add_executor_chip(poseidon2);
 
+        // HintSpaceProvider must be registered BEFORE NativeSumcheck because chips are
+        // dispatched in reverse order: sumcheck runs first and populates the provider.
+        let hint_air: &HintSpaceProviderAir = inventory.next_air::<HintSpaceProviderAir>()?;
+        let cpu_chip = Arc::new(HintSpaceProviderChip::new(hint_air.hint_bus));
+        let provider_gpu = HintSpaceProviderChipGpu::new(cpu_chip.clone());
+        inventory.add_periphery_chip(provider_gpu);
+
         inventory.next_air::<NativeSumcheckAir>()?;
-        let sumcheck = NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits);
+        let sumcheck =
+            NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits, cpu_chip);
         inventory.add_executor_chip(sumcheck);
 
         Ok(())

diff --git a/extensions/native/circuit/src/hint_space_provider.rs b/extensions/native/circuit/src/hint_space_provider.rs
@@ -130,3 +130,56 @@ impl<F: PrimeField32> ChipUsageGetter for HintSpaceProviderChip<F> {
         NUM_HINT_SPACE_PROVIDER_COLS
     }
 }
+
+#[cfg(feature = "cuda")]
+pub mod cuda {
+    use std::sync::Arc;
+
+    use openvm_circuit::arch::DenseRecordArena;
+    use openvm_cuda_backend::{base::DeviceMatrix, prover_backend::GpuBackend, types::F};
+    use openvm_cuda_common::copy::MemCopyH2D;
+    use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+
+    use super::{HintSpaceProviderChip, NUM_HINT_SPACE_PROVIDER_COLS};
+    use crate::cuda_abi::hint_space_provider_cuda;
+
+    pub struct HintSpaceProviderChipGpu {
+        pub cpu_chip: Arc<HintSpaceProviderChip<F>>,
+    }
+
+    impl HintSpaceProviderChipGpu {
+        pub fn new(cpu_chip: Arc<HintSpaceProviderChip<F>>) -> Self {
+            Self { cpu_chip }
+        }
+    }
+
+    impl Chip<DenseRecordArena, GpuBackend> for HintSpaceProviderChipGpu {
+        fn generate_proving_ctx(&self, _: DenseRecordArena) -> AirProvingContext<GpuBackend> {
+            let data = std::mem::take(&mut *self.cpu_chip.data.lock().unwrap());
+            let rows_used = data.len();
+            let height = rows_used.next_power_of_two().max(2);
+
+            // Flatten (hint_id, offset, value) triples into a contiguous [F] buffer
+            let flat: Vec<F> = data
+                .into_iter()
+                .flat_map(|(h, o, v)| [h, o, v])
+                .collect();
+
+            let d_records = flat.to_device().unwrap();
+            let trace = DeviceMatrix::<F>::with_capacity(height, NUM_HINT_SPACE_PROVIDER_COLS);
+
+            unsafe {
+                hint_space_provider_cuda::tracegen(
+                    trace.buffer(),
+                    height,
+                    NUM_HINT_SPACE_PROVIDER_COLS,
+                    &d_records,
+                    rows_used,
+                )
+                .unwrap();
+            }
+
+            AirProvingContext::simple_no_pis(trace)
+        }
+    }
+}
diff --git a/extensions/native/circuit/src/sumcheck/cuda.rs b/extensions/native/circuit/src/sumcheck/cuda.rs
@@ -1,4 +1,4 @@
-use std::{mem::size_of, slice::from_raw_parts, sync::Arc};
+use std::{borrow::Borrow, mem::size_of, slice::from_raw_parts, sync::Arc};
 
 use derive_new::new;
 use openvm_circuit::{arch::DenseRecordArena, utils::next_power_of_two_or_zero};
@@ -7,15 +7,70 @@ use openvm_cuda_backend::{
     base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
 };
 use openvm_cuda_common::copy::MemCopyH2D;
-use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+use openvm_stark_backend::{p3_field::PrimeField32, prover::types::AirProvingContext, Chip};
 
-use super::columns::NativeSumcheckCols;
-use crate::cuda_abi::sumcheck_cuda;
+use super::columns::{LogupSpecificCols, NativeSumcheckCols, ProdSpecificCols};
+use crate::{
+    cuda_abi::sumcheck_cuda,
+    hint_space_provider::SharedHintSpaceProviderChip,
+};
 
 #[derive(new)]
 pub struct NativeSumcheckChipGpu {
     pub range_checker: Arc<VariableRangeCheckerChipGPU>,
     pub timestamp_max_bits: usize,
+    pub hint_space_provider: SharedHintSpaceProviderChip<F>,
+}
+
+impl NativeSumcheckChipGpu {
+    /// Scans execution records to populate the hint space provider with
+    /// (hint_id, offset, value) triples for each hint element referenced
+    /// by prod and logup rows. This bridges the gap between CPU execution
+    /// (which produces the records) and GPU trace generation.
+    fn populate_hint_provider(&self, records: &[u8]) {
+        let width = NativeSumcheckCols::<F>::width();
+        let record_size = width * size_of::<F>();
+        if records.len() % record_size != 0 {
+            return;
+        }
+        let num_rows = records.len() / record_size;
+
+        let row_slice = unsafe {
+            let ptr = records.as_ptr() as *const F;
+            from_raw_parts(ptr, num_rows * width)
+        };
+
+        for i in 0..num_rows {
+            let row_data = &row_slice[i * width..(i + 1) * width];
+            let cols: &NativeSumcheckCols<F> = row_data.borrow();
+
+            if cols.within_round_limit != F::ONE {
+                continue;
+            }
+
+            if cols.prod_row == F::ONE {
+                let prod_specific: &ProdSpecificCols<F> =
+                    cols.specific[..ProdSpecificCols::<F>::width()].borrow();
+                for (j, &val) in prod_specific.p.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.prod_hint_id,
+                        prod_specific.data_ptr + F::from_canonical_usize(j),
+                        val,
+                    );
+                }
+            } else if cols.logup_row == F::ONE {
+                let logup_specific: &LogupSpecificCols<F> =
+                    cols.specific[..LogupSpecificCols::<F>::width()].borrow();
+                for (j, &val) in logup_specific.pq.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.logup_hint_id,
+                        logup_specific.data_ptr + F::from_canonical_usize(j),
+                        val,
+                    );
+                }
+            }
+        }
+    }
 }
 
 impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
@@ -25,6 +80,9 @@ impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
             return get_empty_air_proving_ctx::<GpuBackend>();
         }
 
+        // Populate hint space provider from execution records before GPU upload.
+        self.populate_hint_provider(records);
+
         let width = NativeSumcheckCols::<F>::width();
         let record_size = width * size_of::<F>();
         assert_eq!(records.len() % record_size, 0);