@@ -125,16 +125,16 @@ index dfd3945e9a..08eda4978c 100644
125125 return CUDA_R_16BF;
126126 #endif
127127diff --git a/paddle/phi/backends/gpu/cuda/cudnn_desc.h b/paddle/phi/backends/gpu/cuda/cudnn_desc.h
128- index 189e97534e..8f805afe8c 100644
128+ index ce038ecec9..c78e2e999f 100644
129129--- a/paddle/phi/backends/gpu/cuda/cudnn_desc.h
130130+++ b/paddle/phi/backends/gpu/cuda/cudnn_desc.h
131- @@ -77,7 +77,7 @@ inline cudnnDataType_t ToCudnnDataType(const phi:: DataType& t) {
131+ @@ -77,7 +77,7 @@ inline cudnnDataType_t ToCudnnDataType(const DataType& t) {
132132 type = CUDNN_DATA_FP8_E5M2;
133133 break;
134134 #endif
135135- #if CUDNN_VERSION_MIN(8, 1, 0)
136136+ #if CUDNN_VERSION_MIN(8, 1, 0) || defined(PADDLE_WITH_COREX)
137- case phi:: DataType::BFLOAT16:
137+ case DataType::BFLOAT16:
138138 type = CUDNN_DATA_BFLOAT16;
139139 break;
140140@@ -167,12 +167,26 @@ class TensorDescriptor {
@@ -160,10 +160,10 @@ index 189e97534e..8f805afe8c 100644
160160 dtype,
161161 transformed_dims.size(),
162162 transformed_dims.data()));
163- + #endif
163+ + #endif
164164 }
165165
166- void set(const phi:: DenseTensor& tensor, const cudnnTensorFormat_t format) {
166+ void set(const DenseTensor& tensor, const cudnnTensorFormat_t format) {
167167diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h
168168index 9a8a3b7605..3186f37c20 100644
169169--- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h
@@ -1022,3 +1022,18 @@ index ffdf995ece..4a7e03f4ad 100644
10221022 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
10231023 PD_REGISTER_KERNEL(unsqueeze,
10241024 GPU,
1025+ diff --git a/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu b/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu
1026+ index 1b3393ceab..6bbf4f661b 100644
1027+ --- a/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu
1028+ +++ b/paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu
1029+ @@ -671,8 +671,8 @@ void FP8QuantBlockWiseKernelImpl(const Context &dev_ctx,
1030+ const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
1031+ const size_t min_grid_x = sm_count * 8;
1032+ const size_t min_block_x = 1024;
1033+ - const size_t gridx = min(min_grid_x, src_rows);
1034+ - const size_t blockx = min(min_block_x, src_cols / 128 * 32);
1035+ + const size_t gridx = std::min(min_grid_x, src_rows);
1036+ + const size_t blockx = std::min(min_block_x, src_cols / 128 * 32);
1037+
1038+ bool use_finegrained_range = false;
1039+ char *env_var = getenv("PER_TOKEN_QUANT_FP8_USE_FINEGRAINED_RANGE");
0 commit comments