diff --git a/extensions/native/circuit/cuda/src/hint_space_provider.cu b/extensions/native/circuit/cuda/src/hint_space_provider.cu
new file mode 100644
index 0000000000..a87c1155ac
--- /dev/null
+++ b/extensions/native/circuit/cuda/src/hint_space_provider.cu
@@ -0,0 +1,56 @@
+#include "launcher.cuh"
+#include "primitives/trace_access.h"
+
+// Columns layout matches HintSpaceProviderCols in hint_space_provider.rs
+// Fields: hint_id, offset, value, is_valid
+template <typename T> struct HintSpaceProviderCols {
+    T hint_id;
+    T offset;
+    T value;
+    T is_valid;
+};
+
+constexpr uint32_t HINT_SPACE_PROVIDER_WIDTH = sizeof(HintSpaceProviderCols<uint8_t>);
+
+__global__ void hint_space_provider_tracegen(
+    Fp *trace,
+    size_t height,
+    const Fp *records,
+    size_t rows_used
+) {
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= height) {
+        return;
+    }
+
+    RowSlice row(trace + idx, height);
+    if (idx < rows_used) {
+        // Each record is a triple (hint_id, offset, value)
+        const Fp *rec = records + idx * 3;
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, hint_id, rec[0]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, offset, rec[1]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, value, rec[2]);
+        COL_WRITE_VALUE(row, HintSpaceProviderCols, is_valid, Fp::one());
+    } else {
+        row.fill_zero(0, HINT_SPACE_PROVIDER_WIDTH);
+    }
+}
+
+extern "C" int _hint_space_provider_tracegen(
+    Fp *d_trace,
+    size_t height,
+    size_t width,
+    const Fp *d_records,
+    size_t rows_used
+) {
+    assert((height & (height - 1)) == 0);
+    assert(width == HINT_SPACE_PROVIDER_WIDTH);
+    auto [grid, block] = kernel_launch_params(height);
+    hint_space_provider_tracegen<<<grid, block>>>(
+        d_trace,
+        height,
+        d_records,
+        rows_used
+    );
+    return CHECK_KERNEL();
+}
diff --git a/extensions/native/circuit/src/cuda_abi.rs b/extensions/native/circuit/src/cuda_abi.rs
index 5de9124f0d..4530290521 100644
--- a/extensions/native/circuit/src/cuda_abi.rs
+++ b/extensions/native/circuit/src/cuda_abi.rs
@@ -345,3 +345,33 @@ pub mod native_jal_rangecheck_cuda {
         ))
     }
 }
+
+pub mod hint_space_provider_cuda {
+    use super::*;
+
+    extern "C" {
+        pub fn _hint_space_provider_tracegen(
+            d_trace: *mut F,
+            height: usize,
+            width: usize,
+            d_records: *const F,
+            rows_used: usize,
+        ) -> i32;
+    }
+
+    pub unsafe fn tracegen(
+        d_trace: &DeviceBuffer<F>,
+        height: usize,
+        width: usize,
+        d_records: &DeviceBuffer<F>,
+        rows_used: usize,
+    ) -> Result<(), CudaError> {
+        CudaError::from_result(_hint_space_provider_tracegen(
+            d_trace.as_mut_ptr(),
+            height,
+            width,
+            d_records.as_ptr(),
+            rows_used,
+        ))
+    }
+}
diff --git a/extensions/native/circuit/src/extension/cuda.rs b/extensions/native/circuit/src/extension/cuda.rs
index 765ce8d6cc..9ae1558500 100644
--- a/extensions/native/circuit/src/extension/cuda.rs
+++ b/extensions/native/circuit/src/extension/cuda.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use openvm_circuit::{
     arch::{ChipInventory, ChipInventoryError, DenseRecordArena, VmProverExtension},
     system::cuda::extensions::get_inventory_range_checker,
@@ -14,6 +16,7 @@ use crate::{
     field_arithmetic::{FieldArithmeticAir, FieldArithmeticChipGpu},
     field_extension::{FieldExtensionAir, FieldExtensionChipGpu},
     fri::{FriReducedOpeningAir, FriReducedOpeningChipGpu},
+    hint_space_provider::{cuda::HintSpaceProviderChipGpu, HintSpaceProviderAir, HintSpaceProviderChip},
     jal_rangecheck::{JalRangeCheckAir, JalRangeCheckGpu},
     loadstore::{NativeLoadStoreAir, NativeLoadStoreChipGpu},
     poseidon2::{air::NativePoseidon2Air, NativePoseidon2ChipGpu},
@@ -76,8 +79,16 @@ impl VmProverExtension
         let poseidon2 =
             NativePoseidon2ChipGpu::<1>::new(range_checker.clone(), timestamp_max_bits);
         inventory.add_executor_chip(poseidon2);
 
+        // HintSpaceProvider must be registered BEFORE NativeSumcheck because chips are
+        // dispatched in reverse order: sumcheck runs first and populates the provider.
+        let hint_air: &HintSpaceProviderAir = inventory.next_air::<HintSpaceProviderAir>()?;
+        let cpu_chip = Arc::new(HintSpaceProviderChip::new(hint_air.hint_bus));
+        let provider_gpu = HintSpaceProviderChipGpu::new(cpu_chip.clone());
+        inventory.add_periphery_chip(provider_gpu);
+        inventory.next_air::<NativeSumcheckAir>()?;
-        let sumcheck = NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits);
+        let sumcheck =
+            NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits, cpu_chip);
         inventory.add_executor_chip(sumcheck);
 
         Ok(())
diff --git a/extensions/native/circuit/src/hint_space_provider.rs b/extensions/native/circuit/src/hint_space_provider.rs
index 4feac677b6..65c26bb299 100644
--- a/extensions/native/circuit/src/hint_space_provider.rs
+++ b/extensions/native/circuit/src/hint_space_provider.rs
@@ -130,3 +130,56 @@ impl ChipUsageGetter for HintSpaceProviderChip
         NUM_HINT_SPACE_PROVIDER_COLS
     }
 }
+
+#[cfg(feature = "cuda")]
+pub mod cuda {
+    use std::sync::Arc;
+
+    use openvm_circuit::arch::DenseRecordArena;
+    use openvm_cuda_backend::{base::DeviceMatrix, prover_backend::GpuBackend, types::F};
+    use openvm_cuda_common::copy::MemCopyH2D;
+    use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+
+    use super::{HintSpaceProviderChip, NUM_HINT_SPACE_PROVIDER_COLS};
+    use crate::cuda_abi::hint_space_provider_cuda;
+
+    pub struct HintSpaceProviderChipGpu {
+        pub cpu_chip: Arc<HintSpaceProviderChip<F>>,
+    }
+
+    impl HintSpaceProviderChipGpu {
+        pub fn new(cpu_chip: Arc<HintSpaceProviderChip<F>>) -> Self {
+            Self { cpu_chip }
+        }
+    }
+
+    impl Chip<DenseRecordArena, GpuBackend> for HintSpaceProviderChipGpu {
+        fn generate_proving_ctx(&self, _: DenseRecordArena) -> AirProvingContext<GpuBackend> {
+            let data = std::mem::take(&mut *self.cpu_chip.data.lock().unwrap());
+            let rows_used = data.len();
+            let height = rows_used.next_power_of_two().max(2);
+
+            // Flatten (hint_id, offset, value) triples into a contiguous [F] buffer
+            let flat: Vec<F> = data
+                .into_iter()
+                .flat_map(|(h, o, v)| [h, o, v])
+                .collect();
+
+            let d_records = flat.to_device().unwrap();
+            let trace = DeviceMatrix::<F>::with_capacity(height, NUM_HINT_SPACE_PROVIDER_COLS);
+
+            unsafe {
+                hint_space_provider_cuda::tracegen(
+                    trace.buffer(),
+                    height,
+                    NUM_HINT_SPACE_PROVIDER_COLS,
+                    &d_records,
+                    rows_used,
+                )
+                .unwrap();
+            }
+
+            AirProvingContext::simple_no_pis(trace)
+        }
+    }
+}
diff --git a/extensions/native/circuit/src/sumcheck/cuda.rs b/extensions/native/circuit/src/sumcheck/cuda.rs
index 60aba15b95..7dce367a5b 100644
--- a/extensions/native/circuit/src/sumcheck/cuda.rs
+++ b/extensions/native/circuit/src/sumcheck/cuda.rs
@@ -1,4 +1,4 @@
-use std::{mem::size_of, slice::from_raw_parts, sync::Arc};
+use std::{borrow::Borrow, mem::size_of, slice::from_raw_parts, sync::Arc};
 
 use derive_new::new;
 use openvm_circuit::{arch::DenseRecordArena, utils::next_power_of_two_or_zero};
@@ -7,15 +7,70 @@
 use openvm_cuda_backend::{
     base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
 };
 use openvm_cuda_common::copy::MemCopyH2D;
-use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+use openvm_stark_backend::{p3_field::PrimeField32, prover::types::AirProvingContext, Chip};
 
-use super::columns::NativeSumcheckCols;
-use crate::cuda_abi::sumcheck_cuda;
+use super::columns::{LogupSpecificCols, NativeSumcheckCols, ProdSpecificCols};
+use crate::{
+    cuda_abi::sumcheck_cuda,
+    hint_space_provider::SharedHintSpaceProviderChip,
+};
 
 #[derive(new)]
 pub struct NativeSumcheckChipGpu {
     pub range_checker: Arc<VariableRangeCheckerChipGPU>,
     pub timestamp_max_bits: usize,
+    pub hint_space_provider: SharedHintSpaceProviderChip,
+}
+
+impl NativeSumcheckChipGpu {
+    /// Scans execution records to populate the hint space provider with
+    /// (hint_id, offset, value) triples for each hint element referenced
+    /// by prod and logup rows. This bridges the gap between CPU execution
+    /// (which produces the records) and GPU trace generation.
+    fn populate_hint_provider(&self, records: &[u8]) {
+        let width = NativeSumcheckCols::<F>::width();
+        let record_size = width * size_of::<F>();
+        if records.len() % record_size != 0 {
+            return;
+        }
+        let num_rows = records.len() / record_size;
+
+        let row_slice = unsafe {
+            let ptr = records.as_ptr() as *const F;
+            from_raw_parts(ptr, num_rows * width)
+        };
+
+        for i in 0..num_rows {
+            let row_data = &row_slice[i * width..(i + 1) * width];
+            let cols: &NativeSumcheckCols<F> = row_data.borrow();
+
+            if cols.within_round_limit != F::ONE {
+                continue;
+            }
+
+            if cols.prod_row == F::ONE {
+                let prod_specific: &ProdSpecificCols<F> =
+                    cols.specific[..ProdSpecificCols::<F>::width()].borrow();
+                for (j, &val) in prod_specific.p.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.prod_hint_id,
+                        prod_specific.data_ptr + F::from_canonical_usize(j),
+                        val,
+                    );
+                }
+            } else if cols.logup_row == F::ONE {
+                let logup_specific: &LogupSpecificCols<F> =
+                    cols.specific[..LogupSpecificCols::<F>::width()].borrow();
+                for (j, &val) in logup_specific.pq.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.logup_hint_id,
+                        logup_specific.data_ptr + F::from_canonical_usize(j),
+                        val,
+                    );
+                }
+            }
+        }
+    }
 }
 
 impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
@@ -25,6 +80,9 @@ impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
             return get_empty_air_proving_ctx::<GpuBackend>();
         }
 
+        // Populate hint space provider from execution records before GPU upload.
+        self.populate_hint_provider(records);
+
         let width = NativeSumcheckCols::<F>::width();
         let record_size = width * size_of::<F>();
         assert_eq!(records.len() % record_size, 0);