From 758804a4bc359fccb75318568023f62999e38caa Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Tue, 29 Jul 2025 14:08:43 +0800 Subject: [PATCH 1/4] Use high-performance FindMatchLength to Optimize Snappy compression speed For RISC-V[skip ci] --- snappy-internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/snappy-internal.h b/snappy-internal.h index ae78247..53a5110 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -172,9 +172,10 @@ char* CompressFragment(const char* input, // loading from s2 + n. // // Separate implementation for 64-bit, little-endian cpus. +// riscv and little-endian cpu choose this routinue can be done faster too. #if !SNAPPY_IS_BIG_ENDIAN && \ (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \ - defined(ARCH_ARM)) + defined(ARCH_ARM) || defined(__riscv)) static inline std::pair FindMatchLength(const char* s1, const char* s2, const char* s2_limit, From 1f6ba69d6716136588836d241f9a0fb5e18c6b6f Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Thu, 28 Aug 2025 20:11:39 +0800 Subject: [PATCH 2/4] add RVV support and optimized uncompress speed --- CMakeLists.txt | 25 ++++++++++++ snappy-internal.h | 49 ++++++++++++++++++++++- snappy.cc | 100 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 158 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd71a47..490f5b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,31 @@ int main() { return 0; }" SNAPPY_HAVE_NEON) +#check RVV 1.0 need __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = __riscv_vsetvl_e8m1(8); + vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_1) + + +#check RVV 0.7.1 not __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = vsetvl_e8m1(8); + vuint8m1_t v = vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_0_7) + include(CheckSymbolExists) check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP) check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF) diff --git a/snappy-internal.h b/snappy-internal.h index 53a5110..1c9d043 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -46,7 +46,13 @@ #include #endif -#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON +#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7 +#define SNAPPY_HAVE_RVV 1 +#include +#endif + + +#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1 #else #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0 @@ -60,8 +66,23 @@ namespace internal { using V128 = __m128i; #elif SNAPPY_HAVE_NEON using V128 = uint8x16_t; +#elif SNAPPY_HAVE_RVV +using V128 = vuint8m1_t; #endif +#ifdef SNAPPY_RVV_1 +#define VSETVL_E8M1 __riscv_vsetvl_e8m1 +#define VLE8_V_U8M1 __riscv_vle8_v_u8m1 +#define VSE8_V_U8M1 __riscv_vse8_v_u8m1 +#define VRGATHER_VV_U8M1 __riscv_vrgather_vv_u8m1 +#define VMV_V_X_U8M1 __riscv_vmv_v_x_u8m1 +#elif SNAPPY_RVV_0_7 +#define VSETVL_E8M1 vsetvl_e8m1 +#define VLE8_V_U8M1 vle8_v_u8m1 +#define VSE8_V_U8M1 vse8_v_u8m1 +#define VRGATHER_VV_U8M1 vrgather_vv_u8m1 +#define VMV_V_X_U8M1 vmv_v_x_u8m1 +#endif // Load 128 bits of integer data. `src` must be 16-byte aligned. 
inline V128 V128_Load(const V128* src); @@ -110,6 +131,32 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { } inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); } + +#elif SNAPPY_HAVE_RVV +inline V128 V128_Load(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline V128 V128_LoadU(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline void V128_StoreU(V128* dst, V128 val) { + size_t vl = VSETVL_E8M1(16); + VSE8_V_U8M1(reinterpret_cast(dst), val, vl); +} + +inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { + size_t vl = VSETVL_E8M1(16); + return VRGATHER_VV_U8M1(input, shuffle_mask, vl); +} + +inline V128 V128_DupChar(char c) { + size_t vl = VSETVL_E8M1(16); + return VMV_V_X_U8M1(static_cast(c), vl); +} #endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE diff --git a/snappy.cc b/snappy.cc index 8dc3713..421b335 100644 --- a/snappy.cc +++ b/snappy.cc @@ -281,6 +281,20 @@ inline char* IncrementalCopySlow(const char* src, char* op, // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively. + +// Selects the appropriate vector size based on the current architecture +// vuint8m1_t, RISC-V vector type with fixed 128-bit size +// (sizeof not used due to variable-length vector register in RVV) +#if defined(__SSE2__) || defined(SNAPPY_HAVE_SSSE3) +constexpr size_t kVectorSize = sizeof(V128); // __m128i +#elif defined(__ARM_NEON) || defined(SNAPPY_HAVE_NEON) +constexpr size_t kVectorSize = sizeof(uint8x16_t); // uint8x16_t +#elif defined(SNAPPY_HAVE_RVV) || defined(__riscv_vector) +constexpr size_t kVectorSize = 16; // vuint8m1_t +#else +#error "Unsupported architecture. Please define __SSE2__, __ARM_NEON, or SNAPPY_HAVE_RVV/__riscv_vector." +#endif + template inline constexpr std::array MakePatternMaskBytes( int index_offset, int pattern_size, index_sequence) { @@ -290,19 +304,17 @@ inline constexpr std::array MakePatternMaskBytes( // Computes the shuffle control mask bytes array for given pattern-sizes and // returns an array. template -inline constexpr std::array, +inline constexpr std::array, sizeof...(pattern_sizes_minus_one)> MakePatternMaskBytesTable(int index_offset, index_sequence) { - return { - MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, - make_index_sequence())...}; + return {MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, + make_index_sequence())...}; } - // This is an array of shuffle control masks that can be used as the source // operand for PSHUFB to permute the contents of the destination XMM register // into a repeating byte pattern. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_generation_masks = MakePatternMaskBytesTable( /*index_offset=*/0, @@ -313,7 +325,7 @@ alignas(16) constexpr std::array, // Basically, pattern_reshuffle_masks is a continuation of // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as // pattern_generation_masks for offsets 1, 2, 4, 8 and 16. 
-alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_reshuffle_masks = MakePatternMaskBytesTable( /*index_offset=*/16, @@ -329,6 +341,21 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) { return V128_Shuffle(V128_LoadU(reinterpret_cast(src)), generation_mask); } +// vuint8m1_t cannot be used as an element of std::pair +#if SNAPPY_HAVE_RVV +#define LoadPatternAndReshuffleMask(src, pattern_size) \ + V128 pattern = LoadPattern(src, pattern_size);\ + V128 reshuffle_mask = V128_Load(reinterpret_cast(\ + pattern_reshuffle_masks[pattern_size - 1].data())); +#else + +// Suppress -Wignored-attributes warning for __m128i in x86 SSE2 environment +// warning: ignoring attributes on template argument 'snappy::internal::V128' {aka '__vector(2) long long int'} [-Wignored-attributes] +// This occurs because __m128i has vector attributes (e.g., __attribute__((vector_size(16)))) that are ignored in template parameters. +#ifdef __SSE2__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif SNAPPY_ATTRIBUTE_ALWAYS_INLINE static inline std::pair @@ -345,7 +372,12 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { pattern_reshuffle_masks[pattern_size - 1].data())); return {pattern, reshuffle_mask}; } +// Restore original diagnostic state in x86 SSE2 environment +#ifdef __SSE2__ +#pragma GCC diagnostic pop +#endif +#endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE // Fallback for when we need to copy while extending the pattern, for example @@ -379,10 +411,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { return true; } default: { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(dst - offset, offset) + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(dst - offset, offset); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; + #endif for (int i = 0; i < 4; i++) { V128_StoreU(reinterpret_cast(dst + 16 * i), pattern); pattern = V128_Shuffle(pattern, reshuffle_mask); @@ -490,11 +526,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // Typically, the op_limit is the gating factor so try to simplify the loop // based on that. if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // There is at least one, and at most four 16-byte blocks. Writing four // conditionals instead of a loop allows FDO to layout the code with // respect to the actual probabilities of each length. @@ -517,11 +556,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, } char* const op_end = buf_limit - 15; if (SNAPPY_PREDICT_TRUE(op < op_end)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // This code path is relatively cold however so we save code size // by avoiding unrolling and vectorizing. 
// @@ -1247,13 +1289,41 @@ void MemCopy64(char* dst, const void* src, size_t size) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data); } #else - std::memmove(dst, src, kShortMemCopy); - // Profiling shows that nearly all copies are short. - if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { - std::memmove(dst + kShortMemCopy, - static_cast(src) + kShortMemCopy, - 64 - kShortMemCopy); +#ifdef SNAPPY_HAVE_RVV +uint8_t* dst_u8 = (uint8_t*)dst; +const uint8_t* src_u8 = (const uint8_t*)src; +if (src_u8 < dst_u8 && dst_u8 < src_u8 + size) { //overlap bwd copy + size_t offset = size; + while (offset > 0) { + size_t vl = VSETVL_E8M1(offset); + offset -= vl; + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + } +} else { + size_t vl = VSETVL_E8M1(size); + if (vl < size) { // if size >vl,use the max_vlen copy + size_t offset = 0; + while (offset < size) { + vl = VSETVL_E8M1(size - offset); + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + offset += vl; + } + } else { // copy the leaft + vuint8m1_t vec = VLE8_V_U8M1(src_u8, vl); + VSE8_V_U8M1(dst_u8, vec, vl); + } } +#else +std::memmove(dst, src, kShortMemCopy); + //Profiling shows that nearly all copies are short. +if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { + std::memmove(dst + kShortMemCopy, + static_cast(src) + kShortMemCopy, + 64 - kShortMemCopy);} + +#endif #endif } From 802ef73aed1be6bf62447a587f459ad067914b4d Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Thu, 28 Aug 2025 20:42:13 +0800 Subject: [PATCH 3/4] add RVV support and optmized uncompress speed --- CMakeLists.txt | 25 ++++++++++++ cmake/config.h.in | 6 +++ snappy-internal.h | 49 ++++++++++++++++++++++- snappy.cc | 100 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 164 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cd71a47..490f5b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,31 @@ int main() { return 0; }" SNAPPY_HAVE_NEON) +#check RVV 1.0 need __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = __riscv_vsetvl_e8m1(8); + vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_1) + + +#check RVV 0.7.1 not __riscv_ prefix +check_cxx_source_compiles(" + #include + #include + #include + int main() { + uint8_t val = 3, dup[8]; + size_t vl = vsetvl_e8m1(8); + vuint8m1_t v = vmv_v_x_u8m1(val, vl); + return 0; + }" SNAPPY_RVV_0_7) + include(CheckSymbolExists) check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP) check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF) diff --git a/cmake/config.h.in b/cmake/config.h.in index 3510c27..de80c5f 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -58,6 +58,12 @@ /* Define to 1 if you target processors with NEON and have . */ #cmakedefine01 SNAPPY_HAVE_NEON +/* Define to 1 if you target processors with RVV1.0 and have . */ +#cmakedefine01 SNAPPY_RVV_1 + +/* Define to 1 if you target processors with RVV0.7 and have . */ +#cmakedefine01 SNAPPY_RVV_0_7 + /* Define to 1 if you have and and want to optimize compression speed by using __crc32cw from . 
*/ #cmakedefine01 SNAPPY_HAVE_NEON_CRC32 diff --git a/snappy-internal.h b/snappy-internal.h index 53a5110..1c9d043 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -46,7 +46,13 @@ #include #endif -#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON +#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7 +#define SNAPPY_HAVE_RVV 1 +#include +#endif + + +#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1 #else #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0 @@ -60,8 +66,23 @@ namespace internal { using V128 = __m128i; #elif SNAPPY_HAVE_NEON using V128 = uint8x16_t; +#elif SNAPPY_HAVE_RVV +using V128 = vuint8m1_t; #endif +#ifdef SNAPPY_RVV_1 +#define VSETVL_E8M1 __riscv_vsetvl_e8m1 +#define VLE8_V_U8M1 __riscv_vle8_v_u8m1 +#define VSE8_V_U8M1 __riscv_vse8_v_u8m1 +#define VRGATHER_VV_U8M1 __riscv_vrgather_vv_u8m1 +#define VMV_V_X_U8M1 __riscv_vmv_v_x_u8m1 +#elif SNAPPY_RVV_0_7 +#define VSETVL_E8M1 vsetvl_e8m1 +#define VLE8_V_U8M1 vle8_v_u8m1 +#define VSE8_V_U8M1 vse8_v_u8m1 +#define VRGATHER_VV_U8M1 vrgather_vv_u8m1 +#define VMV_V_X_U8M1 vmv_v_x_u8m1 +#endif // Load 128 bits of integer data. `src` must be 16-byte aligned. inline V128 V128_Load(const V128* src); @@ -110,6 +131,32 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { } inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); } + +#elif SNAPPY_HAVE_RVV +inline V128 V128_Load(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline V128 V128_LoadU(const V128* src) { + size_t vl = VSETVL_E8M1(16); + return VLE8_V_U8M1(reinterpret_cast(src), vl); +} + +inline void V128_StoreU(V128* dst, V128 val) { + size_t vl = VSETVL_E8M1(16); + VSE8_V_U8M1(reinterpret_cast(dst), val, vl); +} + +inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) { + size_t vl = VSETVL_E8M1(16); + return VRGATHER_VV_U8M1(input, shuffle_mask, vl); +} + +inline V128 V128_DupChar(char c) { + size_t vl = VSETVL_E8M1(16); + return VMV_V_X_U8M1(static_cast(c), vl); +} #endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE diff --git a/snappy.cc b/snappy.cc index 8dc3713..421b335 100644 --- a/snappy.cc +++ b/snappy.cc @@ -281,6 +281,20 @@ inline char* IncrementalCopySlow(const char* src, char* op, // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively. + +// Selects the appropriate vector size based on the current architecture +// vuint8m1_t, RISC-V vector type with fixed 128-bit size +// (sizeof not used due to variable-length vector register in RVV) +#if defined(__SSE2__) || defined(SNAPPY_HAVE_SSSE3) +constexpr size_t kVectorSize = sizeof(V128); // __m128i +#elif defined(__ARM_NEON) || defined(SNAPPY_HAVE_NEON) +constexpr size_t kVectorSize = sizeof(uint8x16_t); // uint8x16_t +#elif defined(SNAPPY_HAVE_RVV) || defined(__riscv_vector) +constexpr size_t kVectorSize = 16; // vuint8m1_t +#else +#error "Unsupported architecture. Please define __SSE2__, __ARM_NEON, or SNAPPY_HAVE_RVV/__riscv_vector." +#endif + template inline constexpr std::array MakePatternMaskBytes( int index_offset, int pattern_size, index_sequence) { @@ -290,19 +304,17 @@ inline constexpr std::array MakePatternMaskBytes( // Computes the shuffle control mask bytes array for given pattern-sizes and // returns an array. 
template -inline constexpr std::array, +inline constexpr std::array, sizeof...(pattern_sizes_minus_one)> MakePatternMaskBytesTable(int index_offset, index_sequence) { - return { - MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, - make_index_sequence())...}; + return {MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1, + make_index_sequence())...}; } - // This is an array of shuffle control masks that can be used as the source // operand for PSHUFB to permute the contents of the destination XMM register // into a repeating byte pattern. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_generation_masks = MakePatternMaskBytesTable( /*index_offset=*/0, @@ -313,7 +325,7 @@ alignas(16) constexpr std::array, // Basically, pattern_reshuffle_masks is a continuation of // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as // pattern_generation_masks for offsets 1, 2, 4, 8 and 16. -alignas(16) constexpr std::array, +alignas(16) constexpr std::array, 16> pattern_reshuffle_masks = MakePatternMaskBytesTable( /*index_offset=*/16, @@ -329,6 +341,21 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) { return V128_Shuffle(V128_LoadU(reinterpret_cast(src)), generation_mask); } +// vuint8m1_t cannot be used as an element of std::pair +#if SNAPPY_HAVE_RVV +#define LoadPatternAndReshuffleMask(src, pattern_size) \ + V128 pattern = LoadPattern(src, pattern_size);\ + V128 reshuffle_mask = V128_Load(reinterpret_cast(\ + pattern_reshuffle_masks[pattern_size - 1].data())); +#else + +// Suppress -Wignored-attributes warning for __m128i in x86 SSE2 environment +// warning: ignoring attributes on template argument 'snappy::internal::V128' {aka '__vector(2) long long int'} [-Wignored-attributes] +// This occurs because __m128i has vector attributes (e.g., __attribute__((vector_size(16)))) that are ignored in template parameters. +#ifdef __SSE2__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif SNAPPY_ATTRIBUTE_ALWAYS_INLINE static inline std::pair @@ -345,7 +372,12 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) { pattern_reshuffle_masks[pattern_size - 1].data())); return {pattern, reshuffle_mask}; } +// Restore original diagnostic state in x86 SSE2 environment +#ifdef __SSE2__ +#pragma GCC diagnostic pop +#endif +#endif #endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE // Fallback for when we need to copy while extending the pattern, for example @@ -379,10 +411,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { return true; } default: { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(dst - offset, offset) + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(dst - offset, offset); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; + #endif for (int i = 0; i < 4; i++) { V128_StoreU(reinterpret_cast(dst + 16 * i), pattern); pattern = V128_Shuffle(pattern, reshuffle_mask); @@ -490,11 +526,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, // Typically, the op_limit is the gating factor so try to simplify the loop // based on that. 
if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // There is at least one, and at most four 16-byte blocks. Writing four // conditionals instead of a loop allows FDO to layout the code with // respect to the actual probabilities of each length. @@ -517,11 +556,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit, } char* const op_end = buf_limit - 15; if (SNAPPY_PREDICT_TRUE(op < op_end)) { + #if SNAPPY_HAVE_RVV + LoadPatternAndReshuffleMask(src, pattern_size); + #else auto pattern_and_reshuffle_mask = LoadPatternAndReshuffleMask(src, pattern_size); V128 pattern = pattern_and_reshuffle_mask.first; V128 reshuffle_mask = pattern_and_reshuffle_mask.second; - + #endif // This code path is relatively cold however so we save code size // by avoiding unrolling and vectorizing. // @@ -1247,13 +1289,41 @@ void MemCopy64(char* dst, const void* src, size_t size) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data); } #else - std::memmove(dst, src, kShortMemCopy); - // Profiling shows that nearly all copies are short. - if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { - std::memmove(dst + kShortMemCopy, - static_cast(src) + kShortMemCopy, - 64 - kShortMemCopy); +#ifdef SNAPPY_HAVE_RVV +uint8_t* dst_u8 = (uint8_t*)dst; +const uint8_t* src_u8 = (const uint8_t*)src; +if (src_u8 < dst_u8 && dst_u8 < src_u8 + size) { //overlap bwd copy + size_t offset = size; + while (offset > 0) { + size_t vl = VSETVL_E8M1(offset); + offset -= vl; + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + } +} else { + size_t vl = VSETVL_E8M1(size); + if (vl < size) { // if size >vl,use the max_vlen copy + size_t offset = 0; + while (offset < size) { + vl = VSETVL_E8M1(size - offset); + vuint8m1_t vec = VLE8_V_U8M1(src_u8 + offset, vl); + VSE8_V_U8M1(dst_u8 + offset, vec, vl); + offset += vl; + } + } else { // copy the leaft + vuint8m1_t vec = VLE8_V_U8M1(src_u8, vl); + VSE8_V_U8M1(dst_u8, vec, vl); + } } +#else +std::memmove(dst, src, kShortMemCopy); + //Profiling shows that nearly all copies are short. +if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) { + std::memmove(dst + kShortMemCopy, + static_cast(src) + kShortMemCopy, + 64 - kShortMemCopy);} + +#endif #endif } From 9fd2b72c77f4e9fb508868f1bb40b29250cb98a6 Mon Sep 17 00:00:00 2001 From: anthony-zy Date: Fri, 17 Oct 2025 10:28:17 +0800 Subject: [PATCH 4/4] fix by some comments --- snappy.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/snappy.cc b/snappy.cc index 9a64563..fb93e0f 100644 --- a/snappy.cc +++ b/snappy.cc @@ -1261,22 +1261,22 @@ void MemCopy64(char* dst, const void* src, size_t size) { // RVV acceleration available on RISC-V when compiled with -march=rv64gcv #elif defined(__riscv) && SNAPPY_HAVE_RVV // Cast pointers to the type we will operate on. - unsigned char* dst_ptr = (unsigned char*)dst; - const unsigned char* src_ptr = (const unsigned char*)src; + unsigned char* dst_ptr = reinterpret_cast(dst); + const unsigned char* src_ptr = reinterpret_cast(src); size_t remaining_bytes = size; - //Loop as long as there are bytes remaining to be copied. + // Loop as long as there are bytes remaining to be copied. 
 while (remaining_bytes > 0) {
-      //Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
-      //Use e8m2 configuration to maximize throughput.
-      size_t vl = VSETVL_E8M2(remaining_bytes);
-      //Load data from the current source pointer.
-      vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
-      //Store data to the current destination pointer.
-      VSE8_V_U8M2(dst_ptr, vec, vl);
-      //Update pointers and the remaining count.
-      src_ptr += vl;
-      dst_ptr += vl;
-      remaining_bytes -= vl;
+      // Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
+      // Use e8m2 configuration to maximize throughput.
+      size_t vl = VSETVL_E8M2(remaining_bytes);
+      // Load data from the current source pointer.
+      vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
+      // Store data to the current destination pointer.
+      VSE8_V_U8M2(dst_ptr, vec, vl);
+      // Update pointers and the remaining count.
+      src_ptr += vl;
+      dst_ptr += vl;
+      remaining_bytes -= vl;
   }

 #else
@@ -1284,8 +1284,8 @@ void MemCopy64(char* dst, const void* src, size_t size) {
   std::memmove(dst, src, kShortMemCopy);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
     std::memmove(dst + kShortMemCopy,
-                static_cast<const uint8_t*>(src) + kShortMemCopy,
-                64 - kShortMemCopy);}
+                 static_cast<const uint8_t*>(src) + kShortMemCopy,
+                 64 - kShortMemCopy);
+  }
 #endif
 }
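
Reference sketch (not part of the patches above): the RVV uncompress path in PATCH 2/4 keeps the existing pattern_generation_masks / pattern_reshuffle_masks tables and emulates the 128-bit PSHUFB/TBL byte shuffle with vrgather.vv on a 16-element vector. The standalone example below illustrates that idea under the same assumptions the patch makes (VLEN >= 128, RVV 1.0 intrinsic names, built with -march=rv64gcv); ReplicatePattern16 and its main() are illustrative helpers, not code from the patch.

#include <riscv_vector.h>
#include <cstdint>
#include <cstdio>

// Replicate a short byte pattern (pattern_size <= 16) across a 16-byte block,
// the way LoadPattern() does: load 16 bytes starting at the pattern, then
// gather bytes with indices i % pattern_size (the "generation mask").
static void ReplicatePattern16(const uint8_t* pattern, size_t pattern_size,
                               uint8_t* out16) {
  uint8_t mask[16];
  for (int i = 0; i < 16; ++i) {
    mask[i] = static_cast<uint8_t>(i % pattern_size);
  }
  size_t vl = __riscv_vsetvl_e8m1(16);               // fixed 16-byte "V128"
  vuint8m1_t src = __riscv_vle8_v_u8m1(pattern, vl);
  vuint8m1_t idx = __riscv_vle8_v_u8m1(mask, vl);
  // vrgather.vv: result[i] = src[idx[i]], the RVV analogue of PSHUFB/TBL.
  vuint8m1_t rep = __riscv_vrgather_vv_u8m1(src, idx, vl);
  __riscv_vse8_v_u8m1(out16, rep, vl);
}

int main() {
  uint8_t pat[16] = {'a', 'b', 'c'};   // only the first 3 bytes are the pattern
  uint8_t out[16];
  ReplicatePattern16(pat, 3, out);
  fwrite(out, 1, 16, stdout);          // prints "abcabcabcabcabca"
  return 0;
}

This also shows why the patch resorts to a LoadPatternAndReshuffleMask macro on RISC-V: vuint8m1_t is a sizeless type, so it cannot be stored in std::pair the way __m128i and uint8x16_t can, and the pattern/reshuffle-mask pair has to live in two plain locals instead.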
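The MemCopy64 replacement follows the standard RVV strip-mining idiom: each vsetvl call grants at most VLMAX bytes, so a single loop handles any copy size and any VLEN. PATCH 2/4 additionally special-cases a backward copy for overlapping ranges, while PATCH 4/4 settles on the simple forward e8/m2 loop. A minimal compilable sketch of that forward loop in isolation (again assuming -march=rv64gcv; RvvCopy is an illustrative name, not the patch's function):

#include <riscv_vector.h>
#include <cstddef>
#include <cstdint>

// Forward byte copy using LMUL=2 register groups, as in the PATCH 4/4 loop.
// Each iteration copies vl bytes, where vl is whatever vsetvl grants
// (at most n, at most VLMAX for e8/m2). Assumes non-overlapping buffers.
void RvvCopy(uint8_t* dst, const uint8_t* src, size_t n) {
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e8m2(n);
    vuint8m2_t v = __riscv_vle8_v_u8m2(src, vl);
    __riscv_vse8_v_u8m2(dst, v, vl);
    src += vl;
    dst += vl;
    n -= vl;
  }
}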