diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd71a47..490f5b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,6 +216,31 @@ int main() {
   return 0;
 }" SNAPPY_HAVE_NEON)
 
+# Check for RVV 1.0 intrinsics (these use the __riscv_ prefix).
+check_cxx_source_compiles("
+  #include <riscv_vector.h>
+  #include <stdint.h>
+  #include <stddef.h>
+  int main() {
+    uint8_t val = 3, dup[8];
+    size_t vl = __riscv_vsetvl_e8m1(8);
+    vuint8m1_t v = __riscv_vmv_v_x_u8m1(val, vl);
+    return 0;
+  }" SNAPPY_RVV_1)
+
+
+# Check for RVV 0.7.1 intrinsics (no __riscv_ prefix).
+check_cxx_source_compiles("
+  #include <riscv_vector.h>
+  #include <stdint.h>
+  #include <stddef.h>
+  int main() {
+    uint8_t val = 3, dup[8];
+    size_t vl = vsetvl_e8m1(8);
+    vuint8m1_t v = vmv_v_x_u8m1(val, vl);
+    return 0;
+  }" SNAPPY_RVV_0_7)
+
 include(CheckSymbolExists)
 check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
 check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 3510c27..de80c5f 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -58,6 +58,12 @@
 /* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
 #cmakedefine01 SNAPPY_HAVE_NEON
 
+/* Define to 1 if you target processors with RVV 1.0 and have <riscv_vector.h>. */
+#cmakedefine01 SNAPPY_RVV_1
+
+/* Define to 1 if you target processors with RVV 0.7 and have <riscv_vector.h>. */
+#cmakedefine01 SNAPPY_RVV_0_7
+
 /* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
    compression speed by using __crc32cw from <arm_acle.h>. */
 #cmakedefine01 SNAPPY_HAVE_NEON_CRC32
diff --git a/snappy-internal.h b/snappy-internal.h
index 0863101..00b2db5 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -46,7 +46,24 @@
 #include <arm_neon.h>
 #endif
 
-#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
+#if SNAPPY_RVV_1 || SNAPPY_RVV_0_7
+#define SNAPPY_HAVE_RVV 1
+#include <riscv_vector.h>
+#else
+#define SNAPPY_HAVE_RVV 0
+#endif
+
+#if SNAPPY_RVV_1
+#define VSETVL_E8M2 __riscv_vsetvl_e8m2
+#define VLE8_V_U8M2 __riscv_vle8_v_u8m2
+#define VSE8_V_U8M2 __riscv_vse8_v_u8m2
+#elif SNAPPY_RVV_0_7
+#define VSETVL_E8M2 vsetvl_e8m2
+#define VLE8_V_U8M2 vle8_v_u8m2
+#define VSE8_V_U8M2 vse8_v_u8m2
+#endif
+
+#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
 #else
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -61,7 +78,7 @@ using V128 = __m128i;
 #elif SNAPPY_HAVE_NEON
 using V128 = uint8x16_t;
 #endif
-
+
 // Load 128 bits of integer data. `src` must be 16-byte aligned.
 inline V128 V128_Load(const V128* src);
 
@@ -110,6 +127,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
 }
 
 inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
+
+
 #endif
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
@@ -172,6 +191,7 @@ char* CompressFragment(const char* input,
 // loading from s2 + n.
 //
 // Separate implementation for 64-bit, little-endian cpus.
+// Little-endian RISC-V CPUs can also take this faster path.
 #if !SNAPPY_IS_BIG_ENDIAN && \
     (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
      defined(ARCH_ARM) || defined(__riscv))
diff --git a/snappy.cc b/snappy.cc
index 8dc3713..832455b 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -281,6 +281,8 @@ inline char* IncrementalCopySlow(const char* src, char* op,
 // 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
 // calling MakePatternMaskBytes(0, 6, index_sequence<16>()) and
 // MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
+
+
 template <size_t... indexes>
 inline constexpr std::array<char, sizeof(V128)> MakePatternMaskBytes(
     int index_offset, int pattern_size, index_sequence<indexes...>) {
@@ -298,7 +300,6 @@ MakePatternMaskBytesTable(int index_offset,
       MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
                            make_index_sequence</*indexes=*/sizeof(V128)>())...};
 }
-
 // This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
@@ -329,7 +330,6 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
   return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
                       generation_mask);
 }
-
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
 static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
 LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -345,7 +345,6 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
       pattern_reshuffle_masks[pattern_size - 1].data()));
   return {pattern, reshuffle_mask};
 }
-
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
 // Fallback for when we need to copy while extending the pattern, for example
@@ -494,7 +493,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
         LoadPatternAndReshuffleMask(src, pattern_size);
     V128 pattern = pattern_and_reshuffle_mask.first;
     V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
-
     // There is at least one, and at most four 16-byte blocks. Writing four
     // conditionals instead of a loop allows FDO to layout the code with
     // respect to the actual probabilities of each length.
@@ -521,7 +519,6 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
         LoadPatternAndReshuffleMask(src, pattern_size);
     V128 pattern = pattern_and_reshuffle_mask.first;
     V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
-
     // This code path is relatively cold however so we save code size
     // by avoiding unrolling and vectorizing.
     //
@@ -1246,6 +1243,27 @@ void MemCopy64(char* dst, const void* src, size_t size) {
     data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
   }
+#elif defined(__riscv) && SNAPPY_HAVE_RVV
+  // RVV acceleration, available on RISC-V when compiled with -march=rv64gcv.
+  // Cast the pointers to the byte type the RVV intrinsics operate on.
+  unsigned char* dst_ptr = reinterpret_cast<unsigned char*>(dst);
+  const unsigned char* src_ptr = reinterpret_cast<const unsigned char*>(src);
+  size_t remaining_bytes = size;
+  // Loop as long as there are bytes remaining to be copied.
+  while (remaining_bytes > 0) {
+    // Set the vector configuration: e8 (8-bit elements), m2 (LMUL=2).
+    // With e8m2 each iteration moves up to 2 * VLEN / 8 bytes.
+    size_t vl = VSETVL_E8M2(remaining_bytes);
+    // Load data from the current source pointer.
+    vuint8m2_t vec = VLE8_V_U8M2(src_ptr, vl);
+    // Store data to the current destination pointer.
+    VSE8_V_U8M2(dst_ptr, vec, vl);
+    // Advance the pointers and the remaining count.
+    src_ptr += vl;
+    dst_ptr += vl;
+    remaining_bytes -= vl;
+  }
+
 #else
   std::memmove(dst, src, kShortMemCopy);
   // Profiling shows that nearly all copies are short.
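
For reference, below is a minimal standalone sketch of the strip-mined copy loop that the MemCopy64 hunk adds. It is an illustration, not part of the patch: it assumes the RVV 1.0 intrinsics (the __riscv_-prefixed names selected by SNAPPY_RVV_1) and a compiler invoked with a vector-enabled ISA string such as -march=rv64gcv, as the patch comment notes. The file name rvv_copy_demo.cc and the helper RvvMemcpy are illustrative only.

// rvv_copy_demo.cc - illustrative only; build with e.g.
//   clang++ -O2 --target=riscv64-linux-gnu -march=rv64gcv rvv_copy_demo.cc
#include <riscv_vector.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Strip-mined byte copy: vsetvl picks how many bytes (vl) each iteration
// handles, so the same loop works for any VLEN and any remainder.
static void RvvMemcpy(uint8_t* dst, const uint8_t* src, size_t n) {
  while (n > 0) {
    // 8-bit elements, LMUL=2: up to 2 * VLEN / 8 bytes per iteration.
    size_t vl = __riscv_vsetvl_e8m2(n);
    vuint8m2_t v = __riscv_vle8_v_u8m2(src, vl);
    __riscv_vse8_v_u8m2(dst, v, vl);
    src += vl;
    dst += vl;
    n -= vl;
  }
}

int main() {
  uint8_t src[64], dst[64] = {};
  for (size_t i = 0; i < sizeof(src); ++i) src[i] = static_cast<uint8_t>(i);
  RvvMemcpy(dst, src, sizeof(src));
  std::printf("match: %d\n", std::memcmp(dst, src, sizeof(src)) == 0);
  return 0;
}

Because vsetvl never returns more than the remaining count, this loop copies exactly `size` bytes, whereas the memmove fallback in MemCopy64 always copies kShortMemCopy bytes.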