DPDK  24.07.0
rte_xxh64_avx512.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4 
5 #ifndef RTE_XXH64_AVX512_H
6 #define RTE_XXH64_AVX512_H
7 
8 #ifdef __cplusplus
9 extern "C" {
10 #endif
11 
#include <string.h>
#include <immintrin.h>

#include <rte_common.h>
14 
15 /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
16 static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87ULL;
17 /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
18 static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;
19 /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
20 static const uint64_t PRIME64_3 = 0x165667B19E3779F9ULL;
21 /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
22 static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63ULL;
23 /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
24 static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5ULL;
25 
26 static __rte_always_inline __m512i
27 xxh64_round_avx512(__m512i hash, __m512i input)
28 {
29  hash = _mm512_madd52lo_epu64(hash,
30  input,
31  _mm512_set1_epi64(PRIME64_2));
32 
33  hash = _mm512_rol_epi64(hash, 31);
34 
35  return hash;
36 }
37 
38 static __rte_always_inline __m512i
39 xxh64_fmix_avx512(__m512i hash)
40 {
41  hash = _mm512_xor_si512(hash, _mm512_srli_epi64(hash, 33));
42 
43  return hash;
44 }
45 
46 static __rte_always_inline __m256i
47 rte_xxh64_sketch_avx512(const void *key, uint32_t key_len,
48  __m512i v_seed, uint32_t modulo)
49 {
50  __m512i v_prime64_5, v_hash;
51  size_t remaining = key_len;
52  size_t offset = 0;
53  __m512i input;
54 
55  v_prime64_5 = _mm512_set1_epi64(PRIME64_5);
56  v_hash = _mm512_add_epi64
57  (_mm512_add_epi64(v_seed, v_prime64_5),
58  _mm512_set1_epi64(key_len));
59 
60  while (remaining >= 8) {
61  input = _mm512_set1_epi64(*(uint64_t *)RTE_PTR_ADD(key, offset));
62  v_hash = _mm512_xor_epi64(v_hash,
63  xxh64_round_avx512(_mm512_setzero_si512(), input));
64  v_hash = _mm512_madd52lo_epu64(_mm512_set1_epi64(PRIME64_4),
65  v_hash,
66  _mm512_set1_epi64(PRIME64_1));
67 
68  remaining -= 8;
69  offset += 8;
70  }
71 
72  if (remaining >= 4) {
73  input = _mm512_set1_epi64
74  (*(uint32_t *)RTE_PTR_ADD(key, offset));
75  v_hash = _mm512_xor_epi64(v_hash,
76  _mm512_mullo_epi64(input,
77  _mm512_set1_epi64(PRIME64_1)));
78  v_hash = _mm512_madd52lo_epu64
79  (_mm512_set1_epi64(PRIME64_3),
80  _mm512_rol_epi64(v_hash, 23),
81  _mm512_set1_epi64(PRIME64_2));
82 
83  offset += 4;
84  remaining -= 4;
85  }
86 
87  while (remaining != 0) {
88  input = _mm512_set1_epi64
89  (*(uint8_t *)RTE_PTR_ADD(key, offset));
90  v_hash = _mm512_xor_epi64(v_hash,
91  _mm512_mullo_epi64(input,
92  _mm512_set1_epi64(PRIME64_5)));
93  v_hash = _mm512_mullo_epi64
94  (_mm512_rol_epi64(v_hash, 11),
95  _mm512_set1_epi64(PRIME64_1));
96  offset++;
97  remaining--;
98  }
99 
100  v_hash = xxh64_fmix_avx512(v_hash);
101 
102  /*
103  * theoritically, such modular operations can be replaced by
104  * _mm512_rem_epi64(), but seems it depends on the compiler's
105  * implementation. so here is the limitation that the modulo
106  * value should be power of 2.
107  */
108  __m512i v_hash_remainder = _mm512_set1_epi64((modulo - 1));
109 
110  return _mm512_cvtepi64_epi32(_mm512_and_si512(v_hash, v_hash_remainder));
111 }
112 
113 #ifdef __cplusplus
114 }
115 #endif
116 
117 #endif /* RTE_XXH64_AVX512_H */
#define __rte_always_inline
Definition: rte_common.h:370
#define RTE_PTR_ADD(ptr, x)
Definition: rte_common.h:410