Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions extensions/native/circuit/cuda/src/hint_space_provider.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "launcher.cuh"
#include "primitives/trace_access.h"

// Columns layout matches HintSpaceProviderCols<T> in hint_space_provider.rs
// Fields: hint_id, offset, value, is_valid
template <typename T> struct HintSpaceProviderCols {
T hint_id;
T offset;
T value;
T is_valid;
};

constexpr uint32_t HINT_SPACE_PROVIDER_WIDTH = sizeof(HintSpaceProviderCols<uint8_t>);

__global__ void hint_space_provider_tracegen(
Fp *trace,
size_t height,
const Fp *records,
size_t rows_used
) {
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= height) {
return;
}

RowSlice row(trace + idx, height);
if (idx < rows_used) {
// Each record is a triple (hint_id, offset, value)
const Fp *rec = records + idx * 3;
COL_WRITE_VALUE(row, HintSpaceProviderCols, hint_id, rec[0]);
COL_WRITE_VALUE(row, HintSpaceProviderCols, offset, rec[1]);
COL_WRITE_VALUE(row, HintSpaceProviderCols, value, rec[2]);
COL_WRITE_VALUE(row, HintSpaceProviderCols, is_valid, Fp::one());
} else {
row.fill_zero(0, HINT_SPACE_PROVIDER_WIDTH);
}
}

extern "C" int _hint_space_provider_tracegen(
Fp *d_trace,
size_t height,
size_t width,
const Fp *d_records,
size_t rows_used
) {
assert((height & (height - 1)) == 0);
assert(width == HINT_SPACE_PROVIDER_WIDTH);
auto [grid, block] = kernel_launch_params(height);
hint_space_provider_tracegen<<<grid, block>>>(
d_trace,
height,
d_records,
rows_used
);
return CHECK_KERNEL();
}
30 changes: 30 additions & 0 deletions extensions/native/circuit/src/cuda_abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,3 +345,33 @@ pub mod native_jal_rangecheck_cuda {
))
}
}

pub mod hint_space_provider_cuda {
use super::*;

extern "C" {
pub fn _hint_space_provider_tracegen(
d_trace: *mut F,
height: usize,
width: usize,
d_records: *const F,
rows_used: usize,
) -> i32;
}

pub unsafe fn tracegen(
d_trace: &DeviceBuffer<F>,
height: usize,
width: usize,
d_records: &DeviceBuffer<F>,
rows_used: usize,
) -> Result<(), CudaError> {
CudaError::from_result(_hint_space_provider_tracegen(
d_trace.as_mut_ptr(),
height,
width,
d_records.as_ptr(),
rows_used,
))
}
}
13 changes: 12 additions & 1 deletion extensions/native/circuit/src/extension/cuda.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use openvm_circuit::{
arch::{ChipInventory, ChipInventoryError, DenseRecordArena, VmProverExtension},
system::cuda::extensions::get_inventory_range_checker,
Expand All @@ -14,6 +16,7 @@ use crate::{
field_arithmetic::{FieldArithmeticAir, FieldArithmeticChipGpu},
field_extension::{FieldExtensionAir, FieldExtensionChipGpu},
fri::{FriReducedOpeningAir, FriReducedOpeningChipGpu},
hint_space_provider::{cuda::HintSpaceProviderChipGpu, HintSpaceProviderAir, HintSpaceProviderChip},
jal_rangecheck::{JalRangeCheckAir, JalRangeCheckGpu},
loadstore::{NativeLoadStoreAir, NativeLoadStoreChipGpu},
poseidon2::{air::NativePoseidon2Air, NativePoseidon2ChipGpu},
Expand Down Expand Up @@ -76,8 +79,16 @@ impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Native>
let poseidon2 = NativePoseidon2ChipGpu::<1>::new(range_checker.clone(), timestamp_max_bits);
inventory.add_executor_chip(poseidon2);

// HintSpaceProvider must be registered BEFORE NativeSumcheck because chips are
// dispatched in reverse order: sumcheck runs first and populates the provider.
let hint_air: &HintSpaceProviderAir = inventory.next_air::<HintSpaceProviderAir>()?;
let cpu_chip = Arc::new(HintSpaceProviderChip::new(hint_air.hint_bus));
let provider_gpu = HintSpaceProviderChipGpu::new(cpu_chip.clone());
inventory.add_periphery_chip(provider_gpu);

inventory.next_air::<NativeSumcheckAir>()?;
let sumcheck = NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits);
let sumcheck =
NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits, cpu_chip);
inventory.add_executor_chip(sumcheck);

Ok(())
Expand Down
53 changes: 53 additions & 0 deletions extensions/native/circuit/src/hint_space_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,56 @@ impl<F: PrimeField32> ChipUsageGetter for HintSpaceProviderChip<F> {
NUM_HINT_SPACE_PROVIDER_COLS
}
}

#[cfg(feature = "cuda")]
pub mod cuda {
use std::sync::Arc;

use openvm_circuit::arch::DenseRecordArena;
use openvm_cuda_backend::{base::DeviceMatrix, prover_backend::GpuBackend, types::F};
use openvm_cuda_common::copy::MemCopyH2D;
use openvm_stark_backend::{prover::types::AirProvingContext, Chip};

use super::{HintSpaceProviderChip, NUM_HINT_SPACE_PROVIDER_COLS};
use crate::cuda_abi::hint_space_provider_cuda;

pub struct HintSpaceProviderChipGpu {
pub cpu_chip: Arc<HintSpaceProviderChip<F>>,
}

impl HintSpaceProviderChipGpu {
pub fn new(cpu_chip: Arc<HintSpaceProviderChip<F>>) -> Self {
Self { cpu_chip }
}
}

impl Chip<DenseRecordArena, GpuBackend> for HintSpaceProviderChipGpu {
fn generate_proving_ctx(&self, _: DenseRecordArena) -> AirProvingContext<GpuBackend> {
let data = std::mem::take(&mut *self.cpu_chip.data.lock().unwrap());
let rows_used = data.len();
let height = rows_used.next_power_of_two().max(2);

// Flatten (hint_id, offset, value) triples into a contiguous [F] buffer
let flat: Vec<F> = data
.into_iter()
.flat_map(|(h, o, v)| [h, o, v])
.collect();

let d_records = flat.to_device().unwrap();
let trace = DeviceMatrix::<F>::with_capacity(height, NUM_HINT_SPACE_PROVIDER_COLS);

unsafe {
hint_space_provider_cuda::tracegen(
trace.buffer(),
height,
NUM_HINT_SPACE_PROVIDER_COLS,
&d_records,
rows_used,
)
.unwrap();
}

AirProvingContext::simple_no_pis(trace)
}
}
}
66 changes: 62 additions & 4 deletions extensions/native/circuit/src/sumcheck/cuda.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{mem::size_of, slice::from_raw_parts, sync::Arc};
use std::{borrow::Borrow, mem::size_of, slice::from_raw_parts, sync::Arc};

use derive_new::new;
use openvm_circuit::{arch::DenseRecordArena, utils::next_power_of_two_or_zero};
Expand All @@ -7,15 +7,70 @@ use openvm_cuda_backend::{
base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
};
use openvm_cuda_common::copy::MemCopyH2D;
use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
use openvm_stark_backend::{p3_field::PrimeField32, prover::types::AirProvingContext, Chip};

use super::columns::NativeSumcheckCols;
use crate::cuda_abi::sumcheck_cuda;
use super::columns::{LogupSpecificCols, NativeSumcheckCols, ProdSpecificCols};
use crate::{
cuda_abi::sumcheck_cuda,
hint_space_provider::SharedHintSpaceProviderChip,
};

#[derive(new)]
pub struct NativeSumcheckChipGpu {
pub range_checker: Arc<VariableRangeCheckerChipGPU>,
pub timestamp_max_bits: usize,
pub hint_space_provider: SharedHintSpaceProviderChip<F>,
}

impl NativeSumcheckChipGpu {
/// Scans execution records to populate the hint space provider with
/// (hint_id, offset, value) triples for each hint element referenced
/// by prod and logup rows. This bridges the gap between CPU execution
/// (which produces the records) and GPU trace generation.
fn populate_hint_provider(&self, records: &[u8]) {
let width = NativeSumcheckCols::<F>::width();
let record_size = width * size_of::<F>();
if records.len() % record_size != 0 {
return;
}
let num_rows = records.len() / record_size;

let row_slice = unsafe {
let ptr = records.as_ptr() as *const F;
from_raw_parts(ptr, num_rows * width)
};

for i in 0..num_rows {
let row_data = &row_slice[i * width..(i + 1) * width];
let cols: &NativeSumcheckCols<F> = row_data.borrow();

if cols.within_round_limit != F::ONE {
continue;
}

if cols.prod_row == F::ONE {
let prod_specific: &ProdSpecificCols<F> =
cols.specific[..ProdSpecificCols::<F>::width()].borrow();
for (j, &val) in prod_specific.p.iter().enumerate() {
self.hint_space_provider.request(
cols.prod_hint_id,
prod_specific.data_ptr + F::from_canonical_usize(j),
val,
);
}
} else if cols.logup_row == F::ONE {
let logup_specific: &LogupSpecificCols<F> =
cols.specific[..LogupSpecificCols::<F>::width()].borrow();
for (j, &val) in logup_specific.pq.iter().enumerate() {
self.hint_space_provider.request(
cols.logup_hint_id,
logup_specific.data_ptr + F::from_canonical_usize(j),
val,
);
}
}
}
}
}

impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
Expand All @@ -25,6 +80,9 @@ impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
return get_empty_air_proving_ctx::<GpuBackend>();
}

// Populate hint space provider from execution records before GPU upload.
self.populate_hint_provider(records);

let width = NativeSumcheckCols::<F>::width();
let record_size = width * size_of::<F>();
assert_eq!(records.len() % record_size, 0);
Expand Down
Loading