Merged
19 commits
758804a
Use high-performance FindMatchLength to Optimize Snappy compression s…
anthony-zy Jul 29, 2025
1f6ba69
add RVV support and optimized uncompress speed
anthony-zy Aug 28, 2025
802ef73
add RVV support and optmized uncompress speed
anthony-zy Aug 28, 2025
0f989a2
Merge branch 'add_rvv_support' of https://github.com/anthony-zy/snapp…
anthony-zy Aug 28, 2025
eb8d19f
Merge branch 'add_rvv_support' of https://github.com/anthony-zy/snapp…
anthony-zy Aug 28, 2025
27d8915
Merge branch 'add_rvv_support' of https://github.com/anthony-zy/snapp…
anthony-zy Sep 5, 2025
4410326
Merge branch 'google:main' into add_rvv_support
anthony-zy Sep 5, 2025
c860cab
Merge branch 'google:main' into add_rvv_support
anthony-zy Sep 5, 2025
2b81365
Merge branch 'add_rvv_support' of https://github.com/anthony-zy/snapp…
anthony-zy Sep 12, 2025
c19d705
Merge branch 'add_rvv_support' of https://github.com/anthony-zy/snapp…
anthony-zy Sep 12, 2025
77a61a4
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 17, 2025
9fd2b72
fix by some comments
anthony-zy Oct 17, 2025
d457ac7
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 17, 2025
6ab5882
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 17, 2025
c098f68
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 17, 2025
9cc1596
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 17, 2025
b63569f
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 20, 2025
2d6c8f1
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 20, 2025
e92cb6a
Merge branch 'add_rvv_optmized_memcopy64' of https://github.com/antho…
anthony-zy Oct 20, 2025
25 changes: 25 additions & 0 deletions CMakeLists.txt
@@ -216,6 +216,31 @@ int main() {
return 0;
}" SNAPPY_HAVE_NEON)

# Check for RVV 1.0 intrinsics, which require the __riscv_ prefix.
check_cxx_source_compiles("
#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>
int main() {
uint8_t val = 3, dup[8];
size_t vl = __riscv_vsetvl_e8m1(8);
vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl);
return 0;
}" SNAPPY_RVV_1)


# Check for RVV 0.7.1 intrinsics, which do not use the __riscv_ prefix.
check_cxx_source_compiles("
#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>
int main() {
uint8_t val = 3, dup[8];
size_t vl = vsetvl_e8m1(8);
vuint8m1_t v = vmv_v_x_u8m1(val, vl);
return 0;
}" SNAPPY_RVV_0_7)

include(CheckSymbolExists)
check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)
6 changes: 6 additions & 0 deletions cmake/config.h.in
@@ -58,6 +58,12 @@
/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
#cmakedefine01 SNAPPY_HAVE_NEON

/* Define to 1 if you target processors with RVV 1.0 and have <riscv_vector.h>. */
#cmakedefine01 SNAPPY_RVV_1

/* Define to 1 if you target processors with RVV 0.7 and have <riscv_vector.h>. */
#cmakedefine01 SNAPPY_RVV_0_7

/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
compression speed by using __crc32cw from <arm_acle.h>. */
#cmakedefine01 SNAPPY_HAVE_NEON_CRC32
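Since these flags are generated with #cmakedefine01, SNAPPY_RVV_1 and SNAPPY_RVV_0_7 are always defined, to either 0 or 1, whenever the CMake-generated config.h is used, so consuming code has to test their values with #if rather than #ifdef. A minimal sketch of the intended usage (illustrative only; SNAPPY_VSETVL_E8M1 is a hypothetical name, not part of the patch):

// Illustrative only: how the #cmakedefine01 flags are meant to be consumed.
// Both macros are always defined (to 0 or 1), so value tests are required;
// an #ifdef test would succeed even when the flag is 0.
#if SNAPPY_RVV_1
  // RVV 1.0 intrinsics carry the __riscv_ prefix.
  #define SNAPPY_VSETVL_E8M1 __riscv_vsetvl_e8m1
#elif SNAPPY_RVV_0_7
  // RVV 0.7.1 intrinsics use the unprefixed names.
  #define SNAPPY_VSETVL_E8M1 vsetvl_e8m1
#endif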
24 changes: 22 additions & 2 deletions snappy-internal.h
@@ -46,7 +46,24 @@
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7
#define SNAPPY_HAVE_RVV 1
#include <riscv_vector.h>
#else
#define SNAPPY_HAVE_RVV 0
#endif

#if SNAPPY_RVV_1
#define VSETVL_E8M2 __riscv_vsetvl_e8m2
#define VLE8_V_U8M2 __riscv_vle8_v_u8m2
#define VSE8_V_U8M2 __riscv_vse8_v_u8m2
#elif SNAPPY_RVV_0_7
#define VSETVL_E8M2 vsetvl_e8m2
#define VLE8_V_U8M2 vle8_v_u8m2
#define VSE8_V_U8M2 vse8_v_u8m2
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -61,7 +78,7 @@ using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

@@ -110,6 +127,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }


#endif
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

@@ -172,6 +191,7 @@ char* CompressFragment(const char* input,
// loading from s2 + n.
//
// Separate implementation for 64-bit, little-endian cpus.
// Little-endian RISC-V CPUs also benefit from this faster routine.
#if !SNAPPY_IS_BIG_ENDIAN && \
(defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
defined(ARCH_ARM) || defined(__riscv))
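The fast FindMatchLength path referenced above compares eight bytes at a time and locates the first mismatching byte from the trailing zero bits of the XOR of the two loaded words, which is why it is restricted to 64-bit little-endian targets; adding __riscv to the #if simply lets RISC-V take the same path. A rough sketch of that idea (not the library's exact code; assumes a GCC/Clang-style __builtin_ctzll):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Word-at-a-time match-length sketch; illustrative only.
inline size_t MatchLengthSketch(const char* s1, const char* s2, size_t limit) {
  size_t matched = 0;
  while (matched + 8 <= limit) {
    uint64_t a, b;
    std::memcpy(&a, s1 + matched, 8);  // unaligned 64-bit loads
    std::memcpy(&b, s2 + matched, 8);
    uint64_t x = a ^ b;                // nonzero bits mark mismatching bytes
    if (x != 0) {
      // Little-endian: the lowest-order differing byte is the first mismatch,
      // so (trailing zero bits) / 8 gives the number of extra matching bytes.
      return matched + (__builtin_ctzll(x) >> 3);
    }
    matched += 8;
  }
  while (matched < limit && s1[matched] == s2[matched]) ++matched;
  return matched;
}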
28 changes: 23 additions & 5 deletions snappy.cc
@@ -281,6 +281,8 @@ inline char* IncrementalCopySlow(const char* src, char* op,
// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
// calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.


template <size_t... indexes>
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
int index_offset, int pattern_size, index_sequence<indexes...>) {
@@ -298,7 +300,6 @@ MakePatternMaskBytesTable(int index_offset,
MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
make_index_sequence</*indexes=*/sizeof(V128)>())...};
}

// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
@@ -329,7 +330,6 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
generation_mask);
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -345,7 +345,6 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
pattern_reshuffle_masks[pattern_size - 1].data()));
return {pattern, reshuffle_mask};
}

#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Fallback for when we need to copy while extending the pattern, for example
@@ -494,7 +493,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
LoadPatternAndReshuffleMask(src, pattern_size);
V128 pattern = pattern_and_reshuffle_mask.first;
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;

// There is at least one, and at most four 16-byte blocks. Writing four
// conditionals instead of a loop allows FDO to layout the code with
// respect to the actual probabilities of each length.
@@ -521,7 +519,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
LoadPatternAndReshuffleMask(src, pattern_size);
V128 pattern = pattern_and_reshuffle_mask.first;
V128 reshuffle_mask = pattern_and_reshuffle_mask.second;

// This code path is relatively cold however so we save code size
// by avoiding unrolling and vectorizing.
//
@@ -1246,6 +1243,27 @@ void MemCopy64(char* dst, const void* src, size_t size) {
data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
}
// RVV acceleration available on RISC-V when compiled with -march=rv64gcv
#elif defined(__riscv) && SNAPPY_HAVE_RVV
// Cast pointers to the type we will operate on.
unsigned char* dst_ptr = reinterpret_cast<unsigned char*>(dst);
const unsigned char* src_ptr = reinterpret_cast<const unsigned char*>(src);
size_t remaining_bytes = size;
// Loop as long as there are bytes remaining to be copied.
while (remaining_bytes > 0) {
// Set vector configuration: e8 (8-bit elements), m2 (LMUL=2).
// Use e8m2 configuration to maximize throughput.
size_t vl = VSETVL_E8M2(remaining_bytes);
// Load data from the current source pointer.
vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
// Store data to the current destination pointer.
VSE8_V_U8M2(dst_ptr, vec, vl);
// Update pointers and the remaining count.
src_ptr += vl;
dst_ptr += vl;
remaining_bytes -= vl;
}

#else
std::memmove(dst, src, kShortMemCopy);
// Profiling shows that nearly all copies are short.
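For anyone who wants to try the copy loop above outside of Snappy, the sketch below is a self-contained, standalone version using the RVV 1.0 intrinsic names; it assumes a toolchain that provides <riscv_vector.h> and a vector-enabled target (for example -march=rv64gcv) and is not part of the patch.

#include <riscv_vector.h>
#include <cstddef>
#include <cstdio>
#include <cstring>

// Standalone version of the vl-driven copy loop; illustrative only.
void RvvCopy(unsigned char* dst, const unsigned char* src, size_t size) {
  while (size > 0) {
    size_t vl = __riscv_vsetvl_e8m2(size);        // elements handled this pass
    vuint8m2_t v = __riscv_vle8_v_u8m2(src, vl);  // vector load
    __riscv_vse8_v_u8m2(dst, v, vl);              // vector store
    src += vl;
    dst += vl;
    size -= vl;
  }
}

int main() {
  unsigned char in[100], out[100] = {0};
  for (int i = 0; i < 100; ++i) in[i] = static_cast<unsigned char>(i);
  RvvCopy(out, in, sizeof(in));
  std::puts(std::memcmp(in, out, sizeof(in)) == 0 ? "copies match" : "mismatch");
  return 0;
}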