From 758804a4bc359fccb75318568023f62999e38caa Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Tue, 29 Jul 2025 14:08:43 +0800 Subject: [PATCH 1/4] Use high-performance FindMatchLength to Optimize Snappy compression speed For RISC-V[skip ci] --- snappy-internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snappy-internal.h b/snappy-internal.h index ae78247..53a5110 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -172,9 +172,10 @@ char* CompressFragment(const char* input, // loading from s2 + n. // // Separate implementation for 64-bit, little-endian cpus. +// riscv and little-endian cpu choose this routinue can be done faster too. #if !SNAPPY_IS_BIG_ENDIAN && \ (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \ - defined(ARCH_ARM)) + defined(ARCH_ARM) || defined(__riscv)) static inline std::pair FindMatchLength(const char* s1, const char* s2, const char* s2_limit, From 1f6ba69d6716136588836d241f9a0fb5e18c6b6f Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Thu, 28 Aug 2025 20:11:39 +0800 Subject: [PATCH 2/4] add RVV support and optimized uncompress speed --- CMakeLists.txt | 25 ++++++++++++ snappy-internal.h | 49 ++++++++++++++++++++++- snappy.cc | 100 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 158 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd71a47..490f5b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,31 @@ int main() { return 0; }" SNAPPY_HAVE_NEON) +#check RVV 1.0 need __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = __riscv_vsetvl_e8m1(8); + vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_1) + + +#check RVV 0.7.1 not __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = vsetvl_e8m1(8); + vuint8m1_t v = vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_0_7) + include(CheckSymbolExists) check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP) check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF) diff --git a/snappy-internal.h b/snappy-internal.h index 53a5110..1c9d043 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -46,7 +46,13 @@ #include #endif -#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON +#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7 +#define SNAPPY_HAVE_RVV 1 +#include +#endif + + +#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1 #else #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0 @@ -60,8 +66,23 @@ namespace internal { using V128 = __m128i; #elif SNAPPY_HAVE_NEON using V128 = uint8x16_t; +#elif SNAPPY_HAVE_RVV +using V128 = vuint8m1_t; #endif +#ifdef SNAPPY_RVV_1 +#define VSETVL_E8M1 __riscv_vsetvl_e8m1 +#define VLE8_V_U8M1 __riscv_vle8_v_u8m1 +#define VSE8_V_U8M1 __riscv_vse8_v_u8m1 +#define VRGATHER_VV_U8M1 __riscv_vrgather_vv_u8m1 +#define VMV_V_X_U8M1 __riscv_vmv_v_x_u8m1 +#elif SNAPPY_RVV_0_7 +#define VSETVL_E8M1 vsetvl_e8m1 +#define VLE8_V_U8M1 vle8_v_u8m1 +#define VSE8_V_U8M1 vse8_v_u8m1 +#define VRGATHER_VV_U8M1 vrgather_vv_u8m1 +#define VMV_V_X_U8M1 vmv_v_x_u8m1 +#endif // Load 128 bits of integer data. `src` must be 16-byte aligned. 
inline V128 V128_Load(const V128* src); @@ -110,6 +131,32 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { } inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); } + +#elif SNAPPY_HAVE_RVV +inline V128 V128_Load(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline V128 V128_LoadU(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline void V128_StoreU(V128* dst, V128 val) { + size_t vl = VSETVL_E8M1(16); + VSE8_V_U8M1(reinterpret_cast(dst), val, vl); +} + +inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { + size_t vl = VSETVL_E8M1(16); + return VRGATHER_VV_U8M1(input, shuffle_mask, vl); +} + +inline V128 V128_DupChar(char c) { + size_t vl = VSETVL_E8M1(16); + return VMV_V_X_U8M1(static_cast(c), vl); +} #endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE diff --git a/snappy.cc b/snappy.cc index 8dc3713..421b335 100644 --- a/snappy.cc +++ b/snappy.cc @@ -281,6 +281,20 @@ inline char* IncrementalCopySlow(const char* src, char* op, // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively. + +// Selects the appropriate vector size based on the current architecture +// vuint8m1_t, RISC-V vector type with fixed 128-bit size +// (sizeof not used due to variable-length vector register in RVV) +#if defined(__SSE2__) || defined(SNAPPY_HAVE_SSSE3) +constexpr size_t kVectorSize = sizeof(V128); // __m128i +#elif defined(__ARM_NEON) || defined(SNAPPY_HAVE_NEON) +constexpr size_t kVectorSize = sizeof(uint8x16_t); // uint8x16_t +#elif defined(SNAPPY_HAVE_RVV) || defined(__riscv_vector) +constexpr size_t kVectorSize = 16; // vuint8m1_t +#else +#error "Unsupported architecture. Please define __SSE2__, __ARM_NEON, or SNAPPY_HAVE_RVV/__riscv_vector." +#endif + template inline constexpr std::array MakePatternMaskBytes( int index_offset, int pattern_size, index_sequence) { @@ -290,19 +304,17 @@ inline constexpr std::array MakePatternMaskBytes( // Computes the shuffle control mask bytes array for given pattern-sizes and // returns an array. template -inline constexpr std::array, +inline constexpr std::array, sizeof...(pattern_sizes_minus_one)> MakePatternMaskBytesTable(int index_offset, index_sequence) { - return { - MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, - make_index_sequence())...}; + return {MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, + make_index_sequence())...}; } - // This is an array of shuffle control masks that can be used as the source // operand for PSHUFB to permute the contents of the destination XMM register // into a repeating byte pattern. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_generation_masks = MakePatternMaskBytesTable( /*index_offset=*/0, @@ -313,7 +325,7 @@ alignas(16) constexpr std::array, // Basically, pattern_reshuffle_masks is a continuation of // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as // pattern_generation_masks for offsets 1, 2, 4, 8 and 16. 
-alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_reshuffle_masks = MakePatternMaskBytesTable( /*index_offset=*/16, @@ -329,6 +341,21 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) { return V128_Shuffle(V128_LoadU(reinterpret_cast(src)), generation_mask); } +// vuint8m1_t cannot be used as an element of std::pair +#if SNAPPY_HAVE_RVV +#define LoadPatternAndReshuffleMask(src, pattern_size) \ + V128 pattern = LoadPattern(src, pattern_size);\ + V128 reshuffle_mask = V128_Load(reinterpret_cast(\ + pattern_reshuffle_masks[pattern_size - 1].data())); +#else + +// Suppress -Wignored-attributes warning for __m128i in x86 SSE2 environment +// warning: ignoring attributes on template argument 'snappy::internal::V128' {aka '__vector(2) long long int'} [-Wignored-attributes] +// This occurs because __m128i has vector attributes (e.g., __attribute__((vector_size(16)))) that are ignored in template parameters. +#ifdef __SSE2__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif SNAPPY_ATTRIBUTE_ALWAYS_INLINE static inline std::pair @@ -345,7 +372,12 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { pattern_reshuffle_masks[pattern_size - 1].data())); return {pattern, reshuffle_mask}; } +// Restore original diagnostic state in x86 SSE2 environment +#ifdef __SSE2__ +#pragma GCC diagnostic pop +#endif +#endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE // Fallback for when we need to copy while extending the pattern, for example @@ -379,10 +411,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { return true; } default: { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(dst - offset, offset) + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(dst - offset, offset); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; + #endif for (int i = 0; i < 4; i++) { V128_StoreU(reinterpret_cast(dst + 16 * i), pattern); pattern = V128_Shuffle(pattern, reshuffle_mask); @@ -490,11 +526,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // Typically, the op_limit is the gating factor so try to simplify the loop // based on that. if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // There is at least one, and at most four 16-byte blocks. Writing four // conditionals instead of a loop allows FDO to layout the code with // respect to the actual probabilities of each length. @@ -517,11 +556,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, } char* const op_end = buf_limit - 15; if (SNAPPY_PREDICT_TRUE(op < op_end)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // This code path is relatively cold however so we save code size // by avoiding unrolling and vectorizing. 
// @@ -1247,13 +1289,41 @@ void MemCopy64(char* dst, const void* src, size_t size) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data); } #else - std::memmove(dst, src, kShortMemCopy); - // Profiling shows that nearly all copies are short. - if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { - std::memmove(dst + kShortMemCopy, - static_cast(src) + kShortMemCopy, - 64 - kShortMemCopy); +#ifdef SNAPPY_HAVE_RVV +uint8_t* dst_u8 = (uint8_t*)dst; +const uint8_t* src_u8 = (const uint8_t*)src; +if (src_u8 < dst_u8 && dst_u8 < src_u8 + size) { //overlap bwd copy + size_t offset = size; + while (offset > 0) { + size_t vl = VSETVL_E8M1(offset); + offset -= vl; + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + } +} else { + size_t vl = VSETVL_E8M1(size); + if (vl < size) { // if size >vl,use the max_vlen copy + size_t offset = 0; + while (offset < size) { + vl = VSETVL_E8M1(size - offset); + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + offset += vl; + } + } else { // copy the leaft + vuint8m1_t vec = VLE8_V_U8M1(src_u8, vl); + VSE8_V_U8M1(dst_u8, vec, vl); + } } +#else +std::memmove(dst, src, kShortMemCopy); + //Profiling shows that nearly all copies are short. +if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { + std::memmove(dst + kShortMemCopy, + static_cast(src) + kShortMemCopy, + 64 - kShortMemCopy);} + +#endif #endif } From 802ef73aed1be6bf62447a587f459ad067914b4d Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Thu, 28 Aug 2025 20:42:13 +0800 Subject: [PATCH 3/4] add RVV support and optmized uncompress speed --- CMakeLists.txt | 25 ++++++++++++ cmake/config.h.in | 6 +++ snappy-internal.h | 49 ++++++++++++++++++++++- snappy.cc | 100 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 164 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd71a47..490f5b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,31 @@ int main() { return 0; }" SNAPPY_HAVE_NEON) +#check RVV 1.0 need __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = __riscv_vsetvl_e8m1(8); + vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_1) + + +#check RVV 0.7.1 not __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = vsetvl_e8m1(8); + vuint8m1_t v = vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_0_7) + include(CheckSymbolExists) check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP) check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF) diff --git a/cmake/config.h.in b/cmake/config.h.in index 3510c27..de80c5f 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -58,6 +58,12 @@ /* Define to 1 if you target processors with NEON and have . */ #cmakedefine01 SNAPPY_HAVE_NEON +/* Define to 1 if you target processors with RVV1.0 and have . */ +#cmakedefine01 SNAPPY_RVV_1 + +/* Define to 1 if you target processors with RVV0.7 and have . */ +#cmakedefine01 SNAPPY_RVV_0_7 + /* Define to 1 if you have and and want to optimize compression speed by using __crc32cw from . 
*/ #cmakedefine01 SNAPPY_HAVE_NEON_CRC32 diff --git a/snappy-internal.h b/snappy-internal.h index 53a5110..1c9d043 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -46,7 +46,13 @@ #include #endif -#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON +#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7 +#define SNAPPY_HAVE_RVV 1 +#include +#endif + + +#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1 #else #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0 @@ -60,8 +66,23 @@ namespace internal { using V128 = __m128i; #elif SNAPPY_HAVE_NEON using V128 = uint8x16_t; +#elif SNAPPY_HAVE_RVV +using V128 = vuint8m1_t; #endif +#ifdef SNAPPY_RVV_1 +#define VSETVL_E8M1 __riscv_vsetvl_e8m1 +#define VLE8_V_U8M1 __riscv_vle8_v_u8m1 +#define VSE8_V_U8M1 __riscv_vse8_v_u8m1 +#define VRGATHER_VV_U8M1 __riscv_vrgather_vv_u8m1 +#define VMV_V_X_U8M1 __riscv_vmv_v_x_u8m1 +#elif SNAPPY_RVV_0_7 +#define VSETVL_E8M1 vsetvl_e8m1 +#define VLE8_V_U8M1 vle8_v_u8m1 +#define VSE8_V_U8M1 vse8_v_u8m1 +#define VRGATHER_VV_U8M1 vrgather_vv_u8m1 +#define VMV_V_X_U8M1 vmv_v_x_u8m1 +#endif // Load 128 bits of integer data. `src` must be 16-byte aligned. inline V128 V128_Load(const V128* src); @@ -110,6 +131,32 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { } inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); } + +#elif SNAPPY_HAVE_RVV +inline V128 V128_Load(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline V128 V128_LoadU(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline void V128_StoreU(V128* dst, V128 val) { + size_t vl = VSETVL_E8M1(16); + VSE8_V_U8M1(reinterpret_cast(dst), val, vl); +} + +inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { + size_t vl = VSETVL_E8M1(16); + return VRGATHER_VV_U8M1(input, shuffle_mask, vl); +} + +inline V128 V128_DupChar(char c) { + size_t vl = VSETVL_E8M1(16); + return VMV_V_X_U8M1(static_cast(c), vl); +} #endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE diff --git a/snappy.cc b/snappy.cc index 8dc3713..421b335 100644 --- a/snappy.cc +++ b/snappy.cc @@ -281,6 +281,20 @@ inline char* IncrementalCopySlow(const char* src, char* op, // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively. + +// Selects the appropriate vector size based on the current architecture +// vuint8m1_t, RISC-V vector type with fixed 128-bit size +// (sizeof not used due to variable-length vector register in RVV) +#if defined(__SSE2__) || defined(SNAPPY_HAVE_SSSE3) +constexpr size_t kVectorSize = sizeof(V128); // __m128i +#elif defined(__ARM_NEON) || defined(SNAPPY_HAVE_NEON) +constexpr size_t kVectorSize = sizeof(uint8x16_t); // uint8x16_t +#elif defined(SNAPPY_HAVE_RVV) || defined(__riscv_vector) +constexpr size_t kVectorSize = 16; // vuint8m1_t +#else +#error "Unsupported architecture. Please define __SSE2__, __ARM_NEON, or SNAPPY_HAVE_RVV/__riscv_vector." +#endif + template inline constexpr std::array MakePatternMaskBytes( int index_offset, int pattern_size, index_sequence) { @@ -290,19 +304,17 @@ inline constexpr std::array MakePatternMaskBytes( // Computes the shuffle control mask bytes array for given pattern-sizes and // returns an array. 
template -inline constexpr std::array, +inline constexpr std::array, sizeof...(pattern_sizes_minus_one)> MakePatternMaskBytesTable(int index_offset, index_sequence) { - return { - MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, - make_index_sequence())...}; + return {MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, + make_index_sequence())...}; } - // This is an array of shuffle control masks that can be used as the source // operand for PSHUFB to permute the contents of the destination XMM register // into a repeating byte pattern. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_generation_masks = MakePatternMaskBytesTable( /*index_offset=*/0, @@ -313,7 +325,7 @@ alignas(16) constexpr std::array, // Basically, pattern_reshuffle_masks is a continuation of // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as // pattern_generation_masks for offsets 1, 2, 4, 8 and 16. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_reshuffle_masks = MakePatternMaskBytesTable( /*index_offset=*/16, @@ -329,6 +341,21 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) { return V128_Shuffle(V128_LoadU(reinterpret_cast(src)), generation_mask); } +// vuint8m1_t cannot be used as an element of std::pair +#if SNAPPY_HAVE_RVV +#define LoadPatternAndReshuffleMask(src, pattern_size) \ + V128 pattern = LoadPattern(src, pattern_size);\ + V128 reshuffle_mask = V128_Load(reinterpret_cast(\ + pattern_reshuffle_masks[pattern_size - 1].data())); +#else + +// Suppress -Wignored-attributes warning for __m128i in x86 SSE2 environment +// warning: ignoring attributes on template argument 'snappy::internal::V128' {aka '__vector(2) long long int'} [-Wignored-attributes] +// This occurs because __m128i has vector attributes (e.g., __attribute__((vector_size(16)))) that are ignored in template parameters. +#ifdef __SSE2__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif SNAPPY_ATTRIBUTE_ALWAYS_INLINE static inline std::pair @@ -345,7 +372,12 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { pattern_reshuffle_masks[pattern_size - 1].data())); return {pattern, reshuffle_mask}; } +// Restore original diagnostic state in x86 SSE2 environment +#ifdef __SSE2__ +#pragma GCC diagnostic pop +#endif +#endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE // Fallback for when we need to copy while extending the pattern, for example @@ -379,10 +411,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { return true; } default: { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(dst - offset, offset) + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(dst - offset, offset); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; + #endif for (int i = 0; i < 4; i++) { V128_StoreU(reinterpret_cast(dst + 16 * i), pattern); pattern = V128_Shuffle(pattern, reshuffle_mask); @@ -490,11 +526,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // Typically, the op_limit is the gating factor so try to simplify the loop // based on that. 
if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // There is at least one, and at most four 16-byte blocks. Writing four // conditionals instead of a loop allows FDO to layout the code with // respect to the actual probabilities of each length. @@ -517,11 +556,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, } char* const op_end = buf_limit - 15; if (SNAPPY_PREDICT_TRUE(op < op_end)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // This code path is relatively cold however so we save code size // by avoiding unrolling and vectorizing. // @@ -1247,13 +1289,41 @@ void MemCopy64(char* dst, const void* src, size_t size) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data); } #else - std::memmove(dst, src, kShortMemCopy); - // Profiling shows that nearly all copies are short. - if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { - std::memmove(dst + kShortMemCopy, - static_cast(src) + kShortMemCopy, - 64 - kShortMemCopy); +#ifdef SNAPPY_HAVE_RVV +uint8_t* dst_u8 = (uint8_t*)dst; +const uint8_t* src_u8 = (const uint8_t*)src; +if (src_u8 < dst_u8 && dst_u8 < src_u8 + size) { //overlap bwd copy + size_t offset = size; + while (offset > 0) { + size_t vl = VSETVL_E8M1(offset); + offset -= vl; + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + } +} else { + size_t vl = VSETVL_E8M1(size); + if (vl < size) { // if size >vl,use the max_vlen copy + size_t offset = 0; + while (offset < size) { + vl = VSETVL_E8M1(size - offset); + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + offset += vl; + } + } else { // copy the leaft + vuint8m1_t vec = VLE8_V_U8M1(src_u8, vl); + VSE8_V_U8M1(dst_u8, vec, vl); + } } +#else +std::memmove(dst, src, kShortMemCopy); + //Profiling shows that nearly all copies are short. +if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { + std::memmove(dst + kShortMemCopy, + static_cast(src) + kShortMemCopy, + 64 - kShortMemCopy);} + +#endif #endif } From 9fd2b72c77f4e9fb508868f1bb40b29250cb98a6 Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Fri, 17 Oct 2025 10:28:17 +0800 Subject: [PATCH 4/4] fix by some comments --- snappy.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/snappy.cc b/snappy.cc index 9a64563..fb93e0f 100644 --- a/snappy.cc +++ b/snappy.cc @@ -1261,22 +1261,22 @@ void MemCopy64(char* dst, const void* src, size_t size) { // RVV acceleration available on RISC-V when compiled with -march=rv64gcv #elif defined(__riscv) && SNAPPY_HAVE_RVV // Cast pointers to the type we will operate on. - unsigned char* dst_ptr = (unsigned char*)dst; - const unsigned char* src_ptr = (const unsigned char*)src; + unsigned char* dst_ptr = reinterpret_cast(dst); + const unsigned char* src_ptr = reinterpret_cast(src); size_t remaining_bytes = size; - //Loop as long as there are bytes remaining to be copied. + // Loop as long as there are bytes remaining to be copied. 
 while (remaining_bytes > 0) {
-      //Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
-      //Use e8m2 configuration to maximize throughput.
-      size_t vl = VSETVL_E8M2(remaining_bytes);
-      //Load data from the current source pointer.
-      vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
-      //Store data to the current destination pointer.
-      VSE8_V_U8M2(dst_ptr, vec, vl);
-      //Update pointers and the remaining count.
-      src_ptr += vl;
-      dst_ptr += vl;
-      remaining_bytes -= vl;
+      // Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
+      // Use e8m2 configuration to maximize throughput.
+      size_t vl = VSETVL_E8M2(remaining_bytes);
+      // Load data from the current source pointer.
+      vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
+      // Store data to the current destination pointer.
+      VSE8_V_U8M2(dst_ptr, vec, vl);
+      // Update pointers and the remaining count.
+      src_ptr += vl;
+      dst_ptr += vl;
+      remaining_bytes -= vl;
   }

 #else
@@ -1284,8 +1284,8 @@ void MemCopy64(char* dst, const void* src, size_t size) {
   std::memmove(dst, src, kShortMemCopy);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
     std::memmove(dst + kShortMemCopy,
-                static_cast<const uint8_t*>(src) + kShortMemCopy,
-                64 - kShortMemCopy);}
+                 static_cast<const uint8_t*>(src) + kShortMemCopy,
+                 64 - kShortMemCopy);
+  }
 #endif
 }
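
Reference sketch (not part of the patches above): the RVV uncompress path in PATCH 2/4 keeps the existing pattern_generation_masks / pattern_reshuffle_masks tables and emulates the 128-bit PSHUFB/TBL byte shuffle with vrgather.vv on a 16-element vector. The standalone example below illustrates that idea under the same assumptions the patch makes (VLEN >= 128, RVV 1.0 intrinsic names, built with -march=rv64gcv); ReplicatePattern16 and its main() are illustrative helpers, not code from the patch.

#include <riscv_vector.h>
#include <cstdint>
#include <cstdio>

// Replicate a short byte pattern (pattern_size <= 16) across a 16-byte block,
// the way LoadPattern() does: load 16 bytes starting at the pattern, then
// gather bytes with indices i % pattern_size (the "generation mask").
static void ReplicatePattern16(const uint8_t* pattern, size_t pattern_size,
                               uint8_t* out16) {
  uint8_t mask[16];
  for (int i = 0; i < 16; ++i) {
    mask[i] = static_cast<uint8_t>(i % pattern_size);
  }
  size_t vl = __riscv_vsetvl_e8m1(16);               // fixed 16-byte "V128"
  vuint8m1_t src = __riscv_vle8_v_u8m1(pattern, vl);
  vuint8m1_t idx = __riscv_vle8_v_u8m1(mask, vl);
  // vrgather.vv: result[i] = src[idx[i]], the RVV analogue of PSHUFB/TBL.
  vuint8m1_t rep = __riscv_vrgather_vv_u8m1(src, idx, vl);
  __riscv_vse8_v_u8m1(out16, rep, vl);
}

int main() {
  uint8_t pat[16] = {'a', 'b', 'c'};   // only the first 3 bytes are the pattern
  uint8_t out[16];
  ReplicatePattern16(pat, 3, out);
  fwrite(out, 1, 16, stdout);          // prints "abcabcabcabcabca"
  return 0;
}

This also shows why the patch resorts to a LoadPatternAndReshuffleMask macro on RISC-V: vuint8m1_t is a sizeless type, so it cannot be stored in std::pair the way __m128i and uint8x16_t can, and the pattern/reshuffle-mask pair has to live in two plain locals instead.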
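The MemCopy64 replacement follows the standard RVV strip-mining idiom: each vsetvl call grants at most VLMAX bytes, so a single loop handles any copy size and any VLEN. PATCH 2/4 additionally special-cases a backward copy for overlapping ranges, while PATCH 4/4 settles on the simple forward e8/m2 loop. A minimal compilable sketch of that forward loop in isolation (again assuming -march=rv64gcv; RvvCopy is an illustrative name, not the patch's function):

#include <riscv_vector.h>
#include <cstddef>
#include <cstdint>

// Forward byte copy using LMUL=2 register groups, as in the PATCH 4/4 loop.
// Each iteration copies vl bytes, where vl is whatever vsetvl grants
// (at most n, at most VLMAX for e8/m2). Assumes non-overlapping buffers.
void RvvCopy(uint8_t* dst, const uint8_t* src, size_t n) {
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e8m2(n);
    vuint8m2_t v = __riscv_vle8_v_u8m2(src, vl);
    __riscv_vse8_v_u8m2(dst, v, vl);
    src += vl;
    dst += vl;
    n -= vl;
  }
}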