/*
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (C) 2012-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/* TODO: update */
/* Notice extracted from xxHash homepage:

xxHash is an extremely fast hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.

Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10

Q.Score is a measure of quality of the hash function.
It depends on successfully passing the SMHasher test set.
10 is a perfect score.

Note: SMHasher's CRC32 implementation is not the fastest one.
Other speed-oriented implementations can be faster,
especially in combination with the PCLMUL instruction:
https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735

A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
*/

#if defined (__cplusplus)
extern "C" {
#endif

#include <haproxy/defaults.h>

/* ****************************
 * INLINE mode
 ******************************/
83/*!
84 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
85 * Use these build macros to inline xxhash into the target unit.
86 * Inlining improves performance on small inputs, especially when the length is
87 * expressed as a compile-time constant:
88 *
89 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
90 *
91 * It also keeps xxHash symbols private to the unit, so they are not exported.
92 *
93 * Usage:
94 * #define XXH_INLINE_ALL
95 * #include "xxhash.h"
96 *
97 * Do not compile and link xxhash.o as a separate object, as it is not useful.
98 */
99#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
100 && !defined(XXH_INLINE_ALL_31684351384)
101 /* this section should be traversed only once */
102# define XXH_INLINE_ALL_31684351384
103 /* give access to the advanced API, required to compile implementations */
104# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
105# define XXH_STATIC_LINKING_ONLY
106 /* make all functions private */
107# undef XXH_PUBLIC_API
108# if defined(__GNUC__)
109# define XXH_PUBLIC_API static __inline __attribute__((unused))
110# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
111# define XXH_PUBLIC_API static inline
112# elif defined(_MSC_VER)
113# define XXH_PUBLIC_API static __inline
114# else
115 /* note: this version may generate warnings for unused static functions */
116# define XXH_PUBLIC_API static
117# endif
Willy Tarreaub5684e02015-04-27 11:59:40 +0200118
Dragan Dosende374432020-12-22 12:00:37 +0100119 /*
120 * This part deals with the special case where a unit wants to inline xxHash,
121 * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
122 * as part of some previously included *.h header file.
123 * Without further action, the new include would just be ignored,
124 * and functions would effectively _not_ be inlined (silent failure).
125 * The following macros solve this situation by prefixing all inlined names,
126 * avoiding naming collision with previous inclusions.
127 */
128# ifdef XXH_NAMESPACE
129# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
130 /*
131 * Note: Alternative: #undef all symbols (it's a pretty large list).
132 * Without #error: it compiles, but functions are actually not inlined.
133 */
134# endif
135# define XXH_NAMESPACE XXH_INLINE_
136 /*
137 * Some identifiers (enums, type names) are not symbols, but they must
138 * still be renamed to avoid redeclaration.
139 * Alternative solution: do not redeclare them.
140 * However, this requires some #ifdefs, and is a more dispersed action.
141 * Meanwhile, renaming can be achieved in a single block
142 */
143# define XXH_IPREF(Id) XXH_INLINE_ ## Id
144# define XXH_OK XXH_IPREF(XXH_OK)
145# define XXH_ERROR XXH_IPREF(XXH_ERROR)
146# define XXH_errorcode XXH_IPREF(XXH_errorcode)
147# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
148# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
149# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
150# define XXH32_state_s XXH_IPREF(XXH32_state_s)
151# define XXH32_state_t XXH_IPREF(XXH32_state_t)
152# define XXH64_state_s XXH_IPREF(XXH64_state_s)
153# define XXH64_state_t XXH_IPREF(XXH64_state_t)
154# define XXH3_state_s XXH_IPREF(XXH3_state_s)
155# define XXH3_state_t XXH_IPREF(XXH3_state_t)
156# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
157 /* Ensure the header is parsed again, even if it was previously included */
158# undef XXHASH_H_5627135585666179
159# undef XXHASH_H_STATIC_13879238742
160#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
Willy Tarreaub5684e02015-04-27 11:59:40 +0200161
162
Dragan Dosende374432020-12-22 12:00:37 +0100163
164/* ****************************************************************
165 * Stable API
166 *****************************************************************/
167#ifndef XXHASH_H_5627135585666179
168#define XXHASH_H_5627135585666179 1
169
170/* specific declaration modes for Windows */
171#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
172# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
173# ifdef XXH_EXPORT
174# define XXH_PUBLIC_API __declspec(dllexport)
175# elif XXH_IMPORT
176# define XXH_PUBLIC_API __declspec(dllimport)
177# endif
178# else
179# define XXH_PUBLIC_API /* do nothing */
180# endif
181#endif
182
/*!
 * XXH_NAMESPACE, aka Namespace Emulation:
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
 * any public symbol from the xxHash library with the value of XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: regular symbol names will be automatically translated
 * by this header.
 */
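/*
 * Illustrative sketch, not taken from the upstream documentation: a project
 * bundling its own copy of xxHash might define the namespace on the compiler
 * command line when building the xxHash translation unit, e.g.:
 *
 *     cc -DXXH_NAMESPACE=mylib_ -c xxhash.c
 *
 * The exported symbol then becomes mylib_XXH32 at link time, while calling
 * code built with the same definition keeps writing plain XXH32(), which this
 * header rewrites through macros:
 *
 *     #define XXH_NAMESPACE mylib_
 *     #include "xxhash.h"
 *
 *     XXH32_hash_t h = XXH32(buf, len, 0);   // resolves to mylib_XXH32
 *
 * The `mylib_` prefix, `buf` and `len` are placeholders, not part of the API.
 */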
196#ifdef XXH_NAMESPACE
197# define XXH_CAT(A,B) A##B
198# define XXH_NAME2(A,B) XXH_CAT(A,B)
199# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
200/* XXH32 */
201# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
202# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
203# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
204# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
205# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
206# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
207# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
208# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
209# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
210/* XXH64 */
211# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
212# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
213# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
214# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
215# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
216# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
217# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
218# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
219# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
220/* XXH3_64bits */
221# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
222# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
223# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
224# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
225# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
226# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
227# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
228# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
229# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
230# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
231# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
232# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
233/* XXH3_128bits */
234# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
235# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
236# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
237# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
238# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
239# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
240# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
241# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
242# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
243# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
244# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
245# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
246# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
247#endif
248
249
250/* *************************************
251* Version
252***************************************/
253#define XXH_VERSION_MAJOR 0
254#define XXH_VERSION_MINOR 8
255#define XXH_VERSION_RELEASE 0
256#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
257XXH_PUBLIC_API unsigned XXH_versionNumber (void);
258
259
260/* ****************************
261* Definitions
262******************************/
263#include <stddef.h> /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;


/*-**********************************************************************
268* 32-bit hash
269************************************************************************/
270#if !defined (__VMS) \
271 && (defined (__cplusplus) \
272 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
273# include <stdint.h>
274 typedef uint32_t XXH32_hash_t;
275#else
276# include <limits.h>
277# if UINT_MAX == 0xFFFFFFFFUL
278 typedef unsigned int XXH32_hash_t;
279# else
280# if ULONG_MAX == 0xFFFFFFFFUL
281 typedef unsigned long XXH32_hash_t;
282# else
283# error "unsupported platform: need a 32-bit type"
284# endif
285# endif
286#endif

/*!
 * XXH32():
 *  Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
 *  The memory between input & input+length must be valid (allocated and read-accessible).
 *  "seed" can be used to alter the result predictably.
 *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 * and offers true 64/128 bit hash results. It provides a superior level of
 * dispersion, and greatly reduces the risks of collisions.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
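/*
 * Minimal one-shot usage sketch (illustrative only, not part of the official
 * documentation); the input buffer and the seed value 0 are arbitrary:
 *
 *     #include <stdio.h>
 *     #include "xxhash.h"
 *
 *     int main(void)
 *     {
 *         const char data[] = "hello world";
 *         XXH32_hash_t h = XXH32(data, sizeof(data) - 1, 0);   // seed = 0
 *         printf("%08x\n", (unsigned int)h);
 *         return 0;
 *     }
 */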

/******* Streaming *******/

/*
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * Each reset/update call returns an error code, with 0 meaning OK, and any
 * other value meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 */

typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
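/*
 * Streaming usage sketch (illustrative only): hashing two caller-provided
 * chunks as if they formed one contiguous input; error handling is reduced
 * to the bare minimum and the function name is a placeholder:
 *
 *     static XXH32_hash_t hash_two_chunks(const void* c1, size_t n1,
 *                                         const void* c2, size_t n2)
 *     {
 *         XXH32_hash_t h = 0;
 *         XXH32_state_t* const state = XXH32_createState();
 *         if (state == NULL) return 0;               // allocation failure
 *         if (XXH32_reset(state, 0) == XXH_OK) {     // seed = 0
 *             XXH32_update(state, c1, n1);           // feed input piece by piece
 *             XXH32_update(state, c2, n2);
 *             h = XXH32_digest(state);               // same result as XXH32() over c1 then c2
 *         }
 *         XXH32_freeState(state);
 *         return h;
 *     }
 */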
334
/******* Canonical representation *******/

/*
 * The default return values from XXH functions are unsigned 32 and 64 bit
 * integers.
 * This is the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of byte ordering, since little-endian
 * and big-endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 */

typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
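/*
 * Sketch (illustrative only) of writing a hash in canonical, big-endian form,
 * e.g. before storing it or sending it over a network; `buf` and `len` are
 * placeholders for caller-provided data:
 *
 *     XXH32_hash_t const h = XXH32(buf, len, 0);
 *     XXH32_canonical_t canon;
 *     XXH32_canonicalFromHash(&canon, h);
 *     // canon.digest now holds 4 bytes in a platform-independent order,
 *     // suitable for fwrite()/send(); the receiver recovers the value with:
 *     XXH32_hash_t const h2 = XXH32_hashFromCanonical(&canon);
 *     // h2 == h on any platform
 */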
359
360
361#ifndef XXH_NO_LONG_LONG
362/*-**********************************************************************
363* 64-bit hash
364************************************************************************/
365#if !defined (__VMS) \
366 && (defined (__cplusplus) \
367 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
368# include <stdint.h>
369 typedef uint64_t XXH64_hash_t;
370#else
371 /* the following type must have a width of 64-bit */
372 typedef unsigned long long XXH64_hash_t;
373#endif
374
375/*!
376 * XXH64():
377 * Returns the 64-bit hash of sequence of length @length stored at memory
378 * address @input.
379 * @seed can be used to alter the result predictably.
380 *
381 * This function usually runs faster on 64-bit systems, but slower on 32-bit
382 * systems (see benchmark).
383 *
384 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
385 * and offers true 64/128 bit hash results. It provides a superior level of
386 * dispersion, and greatly reduces the risks of collisions.
387 */
388XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
389
390/******* Streaming *******/
391typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
392XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
393XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
394XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
395
396XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
397XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
398XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
399
400/******* Canonical representation *******/
401typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
402XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
403XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
404
405
406/*-**********************************************************************
407* XXH3 64-bit variant
408************************************************************************/
409
410/* ************************************************************************
411 * XXH3 is a new hash algorithm featuring:
412 * - Improved speed for both small and large inputs
413 * - True 64-bit and 128-bit outputs
414 * - SIMD acceleration
415 * - Improved 32-bit viability
416 *
417 * Speed analysis methodology is explained here:
418 *
419 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
420 *
421 * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
422 * faster on small ones compared to XXH64, though exact differences depend on
423 * the platform.
424 *
425 * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
426 * on all platforms.
427 *
428 * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
429 *
430 * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
431 * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
432 * explained in the implementation.
433 *
434 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
435 * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
436 *
437 * XXH3 offers 2 variants, _64bits and _128bits.
438 * When only 64 bits are needed, prefer calling the _64bits variant, as it
439 * reduces the amount of mixing, resulting in faster speed on small inputs.
440 *
441 * It's also generally simpler to manipulate a scalar return type than a struct.
442 *
443 * The 128-bit version adds additional strength, but it is slightly slower.
444 *
445 * The XXH3 algorithm is still in development.
446 * The results it produces may still change in future versions.
447 *
448 * Results produced by v0.7.x are not comparable with results from v0.7.y.
449 * However, the API is completely stable, and it can safely be used for
450 * ephemeral data (local sessions).
451 *
452 * Avoid storing values in long-term storage until the algorithm is finalized.
 * XXH3's return values will be officially finalized upon reaching v0.8.0;
 * after that point, the return values of XXH3 and XXH128 will no longer
 * change in future versions.
457 *
458 * The API supports one-shot hashing, streaming mode, and custom secrets.
459 */
460
/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
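/*
 * One-shot sketch (illustrative only): hashing a buffer with the default
 * secret and implicit seed 0; `data` and `size` are placeholders for
 * caller-provided input:
 *
 *     XXH64_hash_t const h = XXH3_64bits(data, size);
 */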
465
466/*
467 * XXH3_64bits_withSeed():
468 * This variant generates a custom secret on the fly
469 * based on default secret altered using the `seed` value.
470 * While this operation is decently fast, note that it's not completely free.
471 * Note: seed==0 produces the same results as XXH3_64bits().
472 */
473XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
474
475/*
476 * XXH3_64bits_withSecret():
477 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
478 * This makes it more difficult for an external actor to prepare an intentional collision.
479 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
480 * However, the quality of produced hash values depends on secret's entropy.
481 * Technically, the secret must look like a bunch of random bytes.
482 * Avoid "trivial" or structured data such as repeated sequences or a text document.
483 * Whenever unsure about the "randomness" of the blob of bytes,
484 * consider relabelling it as a "custom seed" instead,
485 * and employ "XXH3_generateSecret()" (see below)
486 * to generate a high entropy secret derived from the custom seed.
487 */
488#define XXH3_SECRET_SIZE_MIN 136
489XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
490
491
492/******* Streaming *******/
493/*
494 * Streaming requires state maintenance.
495 * This operation costs memory and CPU.
496 * As a consequence, streaming is slower than one-shot hashing.
497 * For better performance, prefer one-shot functions whenever applicable.
498 */
499typedef struct XXH3_state_s XXH3_state_t;
500XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
501XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
502XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
503
504/*
505 * XXH3_64bits_reset():
506 * Initialize with default parameters.
507 * digest will be equivalent to `XXH3_64bits()`.
508 */
509XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
510/*
511 * XXH3_64bits_reset_withSeed():
512 * Generate a custom secret from `seed`, and store it into `statePtr`.
513 * digest will be equivalent to `XXH3_64bits_withSeed()`.
514 */
515XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
516/*
517 * XXH3_64bits_reset_withSecret():
518 * `secret` is referenced, it _must outlive_ the hash streaming session.
519 * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
520 * and the quality of produced hash values depends on secret's entropy
521 * (secret's content should look like a bunch of random bytes).
522 * When in doubt about the randomness of a candidate `secret`,
523 * consider employing `XXH3_generateSecret()` instead (see below).
524 */
525XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
526
527XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
528XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
529
530/* note : canonical representation of XXH3 is the same as XXH64
531 * since they both produce XXH64_hash_t values */
532
533
534/*-**********************************************************************
535* XXH3 128-bit variant
536************************************************************************/
537
538typedef struct {
539 XXH64_hash_t low64;
540 XXH64_hash_t high64;
541} XXH128_hash_t;
542
543XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
544XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
545XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
546
547/******* Streaming *******/
548/*
549 * Streaming requires state maintenance.
550 * This operation costs memory and CPU.
551 * As a consequence, streaming is slower than one-shot hashing.
552 * For better performance, prefer one-shot functions whenever applicable.
553 *
554 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
555 * Use already declared XXH3_createState() and XXH3_freeState().
556 *
557 * All reset and streaming functions have same meaning as their 64-bit counterpart.
558 */
559
560XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
561XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
562XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
563
564XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
565XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
566
/* The following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
570
571/*!
572 * XXH128_isEqual():
573 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
574 */
575XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
576
577/*!
578 * XXH128_cmp():
579 *
580 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
581 *
582 * return: >0 if *h128_1 > *h128_2
583 * =0 if *h128_1 == *h128_2
584 * <0 if *h128_1 < *h128_2
585 */
586XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
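/*
 * Sketch (illustrative only): because XXH128_cmp() follows the
 * qsort()/bsearch() comparator convention, an array of XXH128_hash_t can be
 * sorted directly; `hashes` and `count` are placeholders for caller data:
 *
 *     #include <stdlib.h>   // qsort
 *
 *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
 *
 *     // equality test of two individual values:
 *     int same = XXH128_isEqual(hashes[0], hashes[1]);
 */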
587
588
589/******* Canonical representation *******/
590typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
591XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
592XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
593
594
595#endif /* XXH_NO_LONG_LONG */
596
597#endif /* XXHASH_H_5627135585666179 */
598
599
600
601#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
602#define XXHASH_H_STATIC_13879238742
603/* ****************************************************************************
604 * This section contains declarations which are not guaranteed to remain stable.
605 * They may change in future versions, becoming incompatible with a different
606 * version of the library.
607 * These declarations should only be used with static linking.
608 * Never use them in association with dynamic linking!
609 ***************************************************************************** */
610
611/*
612 * These definitions are only present to allow static allocation
613 * of XXH states, on stack or in a struct, for example.
614 * Never **ever** access their members directly.
615 */
616
617struct XXH32_state_s {
618 XXH32_hash_t total_len_32;
619 XXH32_hash_t large_len;
620 XXH32_hash_t v1;
621 XXH32_hash_t v2;
622 XXH32_hash_t v3;
623 XXH32_hash_t v4;
624 XXH32_hash_t mem32[4];
625 XXH32_hash_t memsize;
626 XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */
627}; /* typedef'd to XXH32_state_t */
628
629
630#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
631
632struct XXH64_state_s {
633 XXH64_hash_t total_len;
634 XXH64_hash_t v1;
635 XXH64_hash_t v2;
636 XXH64_hash_t v3;
637 XXH64_hash_t v4;
638 XXH64_hash_t mem64[4];
639 XXH32_hash_t memsize;
640 XXH32_hash_t reserved32; /* required for padding anyway */
641 XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */
642}; /* typedef'd to XXH64_state_t */
643
644#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
645# include <stdalign.h>
646# define XXH_ALIGN(n) alignas(n)
647#elif defined(__GNUC__)
648# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
649#elif defined(_MSC_VER)
650# define XXH_ALIGN(n) __declspec(align(n))
651#else
652# define XXH_ALIGN(n) /* disabled */
653#endif
654
655/* Old GCC versions only accept the attribute after the type in structures. */
656#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
657 && defined(__GNUC__)
658# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
659#else
660# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
661#endif
662
663#define XXH3_INTERNALBUFFER_SIZE 256
664#define XXH3_SECRET_DEFAULT_SIZE 192
665struct XXH3_state_s {
666 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
667 /* used to store a custom secret generated from a seed */
668 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
669 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
670 XXH32_hash_t bufferedSize;
671 XXH32_hash_t reserved32;
672 size_t nbStripesSoFar;
673 XXH64_hash_t totalLen;
674 size_t nbStripesPerBlock;
675 size_t secretLimit;
676 XXH64_hash_t seed;
677 XXH64_hash_t reserved64;
678 const unsigned char* extSecret; /* reference to external secret;
679 * if == NULL, use .customSecret instead */
680 /* note: there may be some padding at the end due to alignment on 64 bytes */
681}; /* typedef'd to XXH3_state_t */
682
683#undef XXH_ALIGN_MEMBER
684
/* When the XXH3_state_t structure is merely allocated on the stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
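/*
 * Sketch (illustrative only) of a stack-allocated state used with a seeded
 * reset; `seed`, `input` and `len` are placeholders for caller-provided data:
 *
 *     XXH3_state_t state;
 *     XXH3_INITSTATE(&state);                      // required before a _withSeed reset
 *     XXH3_64bits_reset_withSeed(&state, seed);
 *     XXH3_64bits_update(&state, input, len);
 *     XXH64_hash_t const h = XXH3_64bits_digest(&state);
 */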
694
695
696/* === Experimental API === */
697/* Symbols defined below must be considered tied to a specific library version. */
698
699/*
700 * XXH3_generateSecret():
701 *
702 * Derive a high-entropy secret from any user-defined content, named customSeed.
703 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
705 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
706 *
707 * The function accepts as input a custom seed of any length and any content,
708 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
709 * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
711 *
712 * The generated secret can then be used with any `*_withSecret()` variant.
713 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
714 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
715 * are part of this list. They all accept a `secret` parameter
716 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
717 * _and_ feature very high entropy (consist of random-looking bytes).
718 * These conditions can be a high bar to meet, so
719 * this function can be used to generate a secret of proper quality.
720 *
721 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even a stupidly "low entropy" source such as a bunch of zeroes.
723 * The resulting `secret` will nonetheless provide all expected qualities.
724 *
725 * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
726 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
727 */
728XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
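/*
 * Sketch (illustrative only): deriving a proper secret from an arbitrary,
 * possibly low-entropy seed blob, then hashing with it; `my_seed_blob`,
 * `my_seed_blob_size`, `data` and `size` are placeholders:
 *
 *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];   // 192 bytes, always filled entirely
 *     XXH3_generateSecret(secret, my_seed_blob, my_seed_blob_size);
 *
 *     XXH64_hash_t  const h64  = XXH3_64bits_withSecret (data, size, secret, sizeof(secret));
 *     XXH128_hash_t const h128 = XXH3_128bits_withSecret(data, size, secret, sizeof(secret));
 *
 * sizeof(secret) (192) satisfies the XXH3_SECRET_SIZE_MIN (136) requirement.
 */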
729
730
731/* simple short-cut to pre-selected XXH3_128bits variant */
732XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
733
734
735#endif /* XXH_NO_LONG_LONG */
736
737
738#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
739# define XXH_IMPLEMENTATION
740#endif
741
742#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
743
744
745/* ======================================================================== */
746/* ======================================================================== */
747/* ======================================================================== */
748
749
750/*-**********************************************************************
751 * xxHash implementation
752 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires the implementation to be visible to the compiler,
 * hence to be included alongside the header.
 * Previously, this was achieved by #including xxhash.c whenever inlining was
 * activated, but that construction created issues with a few build and install
 * systems, as it required xxhash.c to be stored in the /include directory.
761 *
762 * xxHash implementation is now directly integrated within xxhash.h.
763 * As a consequence, xxhash.c is no longer needed in /include.
764 *
765 * xxhash.c is still available and is still useful.
766 * In a "normal" setup, when xxhash is not inlined,
767 * xxhash.h only exposes the prototypes and public symbols,
768 * while xxhash.c can be built into an object file xxhash.o
769 * which can then be linked into the final binary.
770 ************************************************************************/
771
772#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
773 || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
774# define XXH_IMPLEM_13a8737387
775
776/* *************************************
777* Tuning parameters
778***************************************/
779/*!
780 * XXH_FORCE_MEMORY_ACCESS:
781 * By default, access to unaligned memory is controlled by `memcpy()`, which is
782 * safe and portable.
783 *
784 * Unfortunately, on some target/compiler combinations, the generated assembly
785 * is sub-optimal.
786 *
 * The switch below allows selection of a different access method
788 * in the search for improved performance.
789 * Method 0 (default):
790 * Use `memcpy()`. Safe and portable. Default.
791 * Method 1:
792 * `__attribute__((packed))` statement. It depends on compiler extensions
793 * and is therefore not portable.
 * This method is safe _if_ your compiler supports it,
 * and is *generally* as fast or faster than `memcpy`.
 * Method 2:
797 * Direct access via cast. This method doesn't depend on the compiler but
798 * violates the C standard.
799 * It can generate buggy code on targets which do not support unaligned
800 * memory accesses.
801 * But in some circumstances, it's the only known way to get the most
 * performance.
 * Method 3:
804 * Byteshift. This can generate the best code on old compilers which don't
805 * inline small `memcpy()` calls, and it might also be faster on big-endian
806 * systems which lack a native byteswap instruction.
807 * See https://stackoverflow.com/a/32095106/646947 for details.
808 * Prefer these methods in priority order (0 > 1 > 2 > 3)
809 */
810#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
Willy Tarreau4acb99f2021-02-04 17:02:39 +0100811 /* prefer __packed__ structures (method 1) for gcc on armv7 and armv8 */
812# if !defined(__clang__) && ( \
813 (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
814 (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)) )
Dragan Dosende374432020-12-22 12:00:37 +0100815# define XXH_FORCE_MEMORY_ACCESS 1
816# endif
817#endif
818
819/*!
820 * XXH_ACCEPT_NULL_INPUT_POINTER:
821 * If the input pointer is NULL, xxHash's default behavior is to dereference it,
822 * triggering a segfault.
823 * When this macro is enabled, xxHash actively checks the input for a null pointer.
824 * If it is, the result for null input pointers is the same as a zero-length input.
825 */
826#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
827# define XXH_ACCEPT_NULL_INPUT_POINTER 0
828#endif
829
830/*!
831 * XXH_FORCE_ALIGN_CHECK:
832 * This is an important performance trick
833 * for architectures without decent unaligned memory access performance.
834 * It checks for input alignment, and when conditions are met,
835 * uses a "fast path" employing direct 32-bit/64-bit read,
836 * resulting in _dramatically faster_ read speed.
837 *
838 * The check costs one initial branch per hash, which is generally negligible, but not zero.
839 * Moreover, it's not useful to generate binary for an additional code path
 * if memory access uses the same instruction for both aligned and unaligned addresses.
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * The alignment check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory access performance.
846 *
847 * This option does not affect XXH3 (only XXH32 and XXH64).
848 */
849#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
850# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
851 || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
852# define XXH_FORCE_ALIGN_CHECK 0
853# else
854# define XXH_FORCE_ALIGN_CHECK 1
855# endif
856#endif
857
858/*!
859 * XXH_NO_INLINE_HINTS:
860 *
861 * By default, xxHash tries to force the compiler to inline almost all internal
862 * functions.
863 *
864 * This can usually improve performance due to reduced jumping and improved
865 * constant folding, but significantly increases the size of the binary which
866 * might not be favorable.
867 *
868 * Additionally, sometimes the forced inlining can be detrimental to performance,
869 * depending on the architecture.
870 *
871 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
872 * compiler full control on whether to inline or not.
873 *
874 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
875 * -fno-inline with GCC or Clang, this will automatically be defined.
876 */
877#ifndef XXH_NO_INLINE_HINTS
878# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
879 || defined(__NO_INLINE__) /* -O0, -fno-inline */
880# define XXH_NO_INLINE_HINTS 1
881# else
882# define XXH_NO_INLINE_HINTS 0
883# endif
884#endif
885
886/*!
887 * XXH_REROLL:
 * Whether to reroll XXH32_finalize and XXH64_finalize,
889 * instead of using an unrolled jump table/if statement loop.
890 *
891 * This is automatically defined on -Os/-Oz on GCC and Clang.
892 */
893#ifndef XXH_REROLL
894# if defined(__OPTIMIZE_SIZE__)
895# define XXH_REROLL 1
896# else
897# define XXH_REROLL 0
898# endif
899#endif
900
901
902/* *************************************
903* Includes & Memory related functions
904***************************************/
905/*!
906 * Modify the local functions below should you wish to use
907 * different memory routines for malloc() and free()
908 */
909#include <stdlib.h>
910
911static void* XXH_malloc(size_t s) { return malloc(s); }
912static void XXH_free(void* p) { free(p); }
913
914/*! and for memcpy() */
915#include <string.h>
916static void* XXH_memcpy(void* dest, const void* src, size_t size)
917{
918 return memcpy(dest,src,size);
919}
920
921#include <limits.h> /* ULLONG_MAX */
922
923
924/* *************************************
925* Compiler Specific Options
926***************************************/
927#ifdef _MSC_VER /* Visual Studio warning fix */
928# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
929#endif
930
931#if XXH_NO_INLINE_HINTS /* disable inlining hints */
932# if defined(__GNUC__)
933# define XXH_FORCE_INLINE static __attribute__((unused))
934# else
935# define XXH_FORCE_INLINE static
936# endif
937# define XXH_NO_INLINE static
938/* enable inlining hints */
939#elif defined(_MSC_VER) /* Visual Studio */
940# define XXH_FORCE_INLINE static __forceinline
941# define XXH_NO_INLINE static __declspec(noinline)
942#elif defined(__GNUC__)
943# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
944# define XXH_NO_INLINE static __attribute__((noinline))
945#elif defined (__cplusplus) \
946 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
947# define XXH_FORCE_INLINE static inline
948# define XXH_NO_INLINE static
949#else
950# define XXH_FORCE_INLINE static
951# define XXH_NO_INLINE static
952#endif
953
954
955
956/* *************************************
957* Debug
958***************************************/
959/*
960 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
961 * compiler's command line options. The value must be a number.
962 */
963#ifndef XXH_DEBUGLEVEL
964# ifdef DEBUGLEVEL /* backwards compat */
965# define XXH_DEBUGLEVEL DEBUGLEVEL
966# else
967# define XXH_DEBUGLEVEL 0
968# endif
969#endif

#if (XXH_DEBUGLEVEL>=1)
972# include <assert.h> /* note: can still be disabled with NDEBUG */
973# define XXH_ASSERT(c) assert(c)
974#else
975# define XXH_ASSERT(c) ((void)0)
976#endif
977
978/* note: use after variable declarations */
979#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
980
981
982/* *************************************
983* Basic Types
984***************************************/
985#if !defined (__VMS) \
986 && (defined (__cplusplus) \
987 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
988# include <stdint.h>
989 typedef uint8_t xxh_u8;
990#else
991 typedef unsigned char xxh_u8;
992#endif
993typedef XXH32_hash_t xxh_u32;
994
995#ifdef XXH_OLD_NAMES
996# define BYTE xxh_u8
997# define U8 xxh_u8
998# define U32 xxh_u32
999#endif
1000
1001/* *** Memory access *** */
1002
1003#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1004/*
1005 * Manual byteshift. Best for old compilers which don't inline memcpy.
1006 * We actually directly use XXH_readLE32 and XXH_readBE32.
1007 */
1008#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
1009
1010/*
1011 * Force direct memory access. Only works on CPU which support unaligned memory
1012 * access in hardware.
1013 */
1014static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
1015
1016#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1017
1018/*
1019 * __pack instructions are safer but compiler specific, hence potentially
1020 * problematic for some compilers.
1021 *
1022 * Currently only defined for GCC and ICC.
1023 */
1024#ifdef XXH_OLD_NAMES
1025typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
1026#endif
1027static xxh_u32 XXH_read32(const void* ptr)
1028{
1029 typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
1030 return ((const xxh_unalign*)ptr)->u32;
1031}
1032
1033#else
1034
1035/*
1036 * Portable and safe solution. Generally efficient.
1037 * see: https://stackoverflow.com/a/32095106/646947
1038 */
1039static xxh_u32 XXH_read32(const void* memPtr)
1040{
1041 xxh_u32 val;
1042 memcpy(&val, memPtr, sizeof(val));
1043 return val;
1044}
1045
1046#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
1047
1048
/* *** Endianness *** */
typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
1051
1052/*!
1053 * XXH_CPU_LITTLE_ENDIAN:
1054 * Defined to 1 if the target is little endian, or 0 if it is big endian.
1055 * It can be defined externally, for example on the compiler command line.
1056 *
1057 * If it is not defined, a runtime check (which is usually constant folded)
1058 * is used instead.
1059 */
1060#ifndef XXH_CPU_LITTLE_ENDIAN
1061/*
1062 * Try to detect endianness automatically, to avoid the nonstandard behavior
1063 * in `XXH_isLittleEndian()`
1064 */
1065# if defined(_WIN32) /* Windows is always little endian */ \
1066 || defined(__LITTLE_ENDIAN__) \
1067 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1068# define XXH_CPU_LITTLE_ENDIAN 1
1069# elif defined(__BIG_ENDIAN__) \
1070 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
1071# define XXH_CPU_LITTLE_ENDIAN 0
1072# else
1073/*
1074 * runtime test, presumed to simplify to a constant by compiler
1075 */
1076static int XXH_isLittleEndian(void)
1077{
1078 /*
1079 * Portable and well-defined behavior.
1080 * Don't use static: it is detrimental to performance.
1081 */
1082 const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
1083 return one.c[0];
1084}
1085# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
1086# endif
1087#endif
1088
1089
1090
1091
1092/* ****************************************
1093* Compiler-specific Functions and Macros
1094******************************************/
1095#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
1096
1097#ifdef __has_builtin
1098# define XXH_HAS_BUILTIN(x) __has_builtin(x)
1099#else
1100# define XXH_HAS_BUILTIN(x) 0
1101#endif
1102
1103#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
1104 && XXH_HAS_BUILTIN(__builtin_rotateleft64)
1105# define XXH_rotl32 __builtin_rotateleft32
1106# define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for MinGW (GCC under Windows), performance seems poor */
1108#elif defined(_MSC_VER)
1109# define XXH_rotl32(x,r) _rotl(x,r)
1110# define XXH_rotl64(x,r) _rotl64(x,r)
1111#else
1112# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
1113# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
1114#endif
1115
1116#if defined(_MSC_VER) /* Visual Studio */
1117# define XXH_swap32 _byteswap_ulong
1118#elif XXH_GCC_VERSION >= 403
1119# define XXH_swap32 __builtin_bswap32
1120#else
1121static xxh_u32 XXH_swap32 (xxh_u32 x)
1122{
1123 return ((x << 24) & 0xff000000 ) |
1124 ((x << 8) & 0x00ff0000 ) |
1125 ((x >> 8) & 0x0000ff00 ) |
1126 ((x >> 24) & 0x000000ff );
1127}
1128#endif


/* ***************************
* Memory reads
*****************************/
typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;

/*
 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
 *
 * This is ideal for older compilers which don't inline memcpy.
 */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))

XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
1144{
1145 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1146 return bytePtr[0]
1147 | ((xxh_u32)bytePtr[1] << 8)
1148 | ((xxh_u32)bytePtr[2] << 16)
1149 | ((xxh_u32)bytePtr[3] << 24);
1150}
1151
1152XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
1153{
1154 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1155 return bytePtr[3]
1156 | ((xxh_u32)bytePtr[2] << 8)
1157 | ((xxh_u32)bytePtr[1] << 16)
1158 | ((xxh_u32)bytePtr[0] << 24);
1159}
1160
1161#else
1162XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
1163{
1164 return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
1165}
1166
1167static xxh_u32 XXH_readBE32(const void* ptr)
1168{
1169 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
1170}
1171#endif
1172
1173XXH_FORCE_INLINE xxh_u32
1174XXH_readLE32_align(const void* ptr, XXH_alignment align)
1175{
1176 if (align==XXH_unaligned) {
1177 return XXH_readLE32(ptr);
1178 } else {
1179 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
1180 }
1181}
1182
1183
1184/* *************************************
1185* Misc
1186***************************************/
1187XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
1188
1189
1190/* *******************************************************************
1191* 32-bit hash functions
1192*********************************************************************/
1193static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
1194static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
1195static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
1196static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
1197static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */
1198
1199#ifdef XXH_OLD_NAMES
1200# define PRIME32_1 XXH_PRIME32_1
1201# define PRIME32_2 XXH_PRIME32_2
1202# define PRIME32_3 XXH_PRIME32_3
1203# define PRIME32_4 XXH_PRIME32_4
1204# define PRIME32_5 XXH_PRIME32_5
1205#endif
1206
1207static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
1208{
1209 acc += input * XXH_PRIME32_2;
1210 acc = XXH_rotl32(acc, 13);
1211 acc *= XXH_PRIME32_1;
1212#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
1213 /*
1214 * UGLY HACK:
1215 * This inline assembly hack forces acc into a normal register. This is the
1216 * only thing that prevents GCC and Clang from autovectorizing the XXH32
 * loop (pragmas and attributes don't work for some reason) without globally
1218 * disabling SSE4.1.
1219 *
1220 * The reason we want to avoid vectorization is because despite working on
1221 * 4 integers at a time, there are multiple factors slowing XXH32 down on
1222 * SSE4:
1223 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
1224 * newer chips!) making it slightly slower to multiply four integers at
1225 * once compared to four integers independently. Even when pmulld was
1226 * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
1227 * just to multiply unless doing a long operation.
1228 *
1229 * - Four instructions are required to rotate,
 * movdqa tmp, v // not required with VEX encoding
1231 * pslld tmp, 13 // tmp <<= 13
1232 * psrld v, 19 // x >>= 19
1233 * por v, tmp // x |= tmp
1234 * compared to one for scalar:
1235 * roll v, 13 // reliably fast across the board
1236 * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
1237 *
1238 * - Instruction level parallelism is actually more beneficial here because
1239 * the SIMD actually serializes this operation: While v1 is rotating, v2
1240 * can load data, while v3 can multiply. SSE forces them to operate
1241 * together.
1242 *
1243 * How this hack works:
1244 * __asm__("" // Declare an assembly block but don't declare any instructions
1245 * : // However, as an Input/Output Operand,
1246 * "+r" // constrain a read/write operand (+) as a general purpose register (r).
1247 * (acc) // and set acc as the operand
1248 * );
1249 *
 * Because of the 'r', the compiler has promised that acc will be in a
1251 * general purpose register and the '+' says that it will be 'read/write',
1252 * so it has to assume it has changed. It is like volatile without all the
1253 * loads and stores.
1254 *
1255 * Since the argument has to be in a normal register (not an SSE register),
1256 * each time XXH32_round is called, it is impossible to vectorize.
1257 */
1258 __asm__("" : "+r" (acc));
1259#endif
1260 return acc;
1261}
1262
1263/* mix all bits */
1264static xxh_u32 XXH32_avalanche(xxh_u32 h32)
1265{
1266 h32 ^= h32 >> 15;
1267 h32 *= XXH_PRIME32_2;
1268 h32 ^= h32 >> 13;
1269 h32 *= XXH_PRIME32_3;
1270 h32 ^= h32 >> 16;
1271 return(h32);
1272}
1273
1274#define XXH_get32bits(p) XXH_readLE32_align(p, align)
1275
1276static xxh_u32
1277XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1278{
1279#define XXH_PROCESS1 do { \
1280 h32 += (*ptr++) * XXH_PRIME32_5; \
1281 h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
1282} while (0)
1283
1284#define XXH_PROCESS4 do { \
1285 h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
1286 ptr += 4; \
1287 h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
1288} while (0)
1289
1290 /* Compact rerolled version */
1291 if (XXH_REROLL) {
1292 len &= 15;
1293 while (len >= 4) {
1294 XXH_PROCESS4;
1295 len -= 4;
1296 }
1297 while (len > 0) {
1298 XXH_PROCESS1;
1299 --len;
1300 }
1301 return XXH32_avalanche(h32);
1302 } else {
1303 switch(len&15) /* or switch(bEnd - p) */ {
1304 case 12: XXH_PROCESS4;
1305 /* fallthrough */
1306 case 8: XXH_PROCESS4;
1307 /* fallthrough */
1308 case 4: XXH_PROCESS4;
1309 return XXH32_avalanche(h32);
1310
1311 case 13: XXH_PROCESS4;
1312 /* fallthrough */
1313 case 9: XXH_PROCESS4;
1314 /* fallthrough */
1315 case 5: XXH_PROCESS4;
1316 XXH_PROCESS1;
1317 return XXH32_avalanche(h32);
1318
1319 case 14: XXH_PROCESS4;
1320 /* fallthrough */
1321 case 10: XXH_PROCESS4;
1322 /* fallthrough */
1323 case 6: XXH_PROCESS4;
1324 XXH_PROCESS1;
1325 XXH_PROCESS1;
1326 return XXH32_avalanche(h32);
1327
1328 case 15: XXH_PROCESS4;
1329 /* fallthrough */
1330 case 11: XXH_PROCESS4;
1331 /* fallthrough */
1332 case 7: XXH_PROCESS4;
1333 /* fallthrough */
1334 case 3: XXH_PROCESS1;
1335 /* fallthrough */
1336 case 2: XXH_PROCESS1;
1337 /* fallthrough */
1338 case 1: XXH_PROCESS1;
1339 /* fallthrough */
1340 case 0: return XXH32_avalanche(h32);
1341 }
1342 XXH_ASSERT(0);
1343 return h32; /* reaching this point is deemed impossible */
1344 }
1345}
1346
1347#ifdef XXH_OLD_NAMES
1348# define PROCESS1 XXH_PROCESS1
1349# define PROCESS4 XXH_PROCESS4
1350#else
1351# undef XXH_PROCESS1
1352# undef XXH_PROCESS4
1353#endif
1354
1355XXH_FORCE_INLINE xxh_u32
1356XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
1357{
1358 const xxh_u8* bEnd = input + len;
1359 xxh_u32 h32;
1360
1361#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1362 if (input==NULL) {
1363 len=0;
1364 bEnd=input=(const xxh_u8*)(size_t)16;
1365 }
1366#endif
1367
1368 if (len>=16) {
1369 const xxh_u8* const limit = bEnd - 15;
1370 xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1371 xxh_u32 v2 = seed + XXH_PRIME32_2;
1372 xxh_u32 v3 = seed + 0;
1373 xxh_u32 v4 = seed - XXH_PRIME32_1;
1374
1375        do {
1376 v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
1377 v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
1378 v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
1379 v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
1380 } while (input < limit);
1381
1382        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
1383 + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
1384 } else {
1385 h32 = seed + XXH_PRIME32_5;
1386 }
1387
1388 h32 += (xxh_u32)len;
1389
1390 return XXH32_finalize(h32, input, len&15, align);
1391}
1392
1393
1394XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
1395{
1396#if 0
1397 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1398 XXH32_state_t state;
1399 XXH32_reset(&state, seed);
1400 XXH32_update(&state, (const xxh_u8*)input, len);
1401 return XXH32_digest(&state);
1402
1403#else
1404
1405 if (XXH_FORCE_ALIGN_CHECK) {
1406 if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
1407 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
1408 } }
1409
1410 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
1411#endif
1412}
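/*
 * Illustrative sketch (not part of the library): typical one-shot usage of
 * XXH32(). The buffer and seed below are arbitrary examples.
 *
 *   const char data[] = "example input";
 *   XXH32_hash_t const h = XXH32(data, sizeof(data) - 1, 0);   // seed = 0
 *   // The same input and seed always produce the same 32-bit hash.
 */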
1413
1414
1415
1416/******* Hash streaming *******/
1417
1418XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
1419{
1420 return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
1421}
1422XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
1423{
1424 XXH_free(statePtr);
1425 return XXH_OK;
1426}
1427
1428XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
1429{
1430 memcpy(dstState, srcState, sizeof(*dstState));
1431}
1432
1433XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
1434{
1435 XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
1436 memset(&state, 0, sizeof(state));
1437 state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1438 state.v2 = seed + XXH_PRIME32_2;
1439 state.v3 = seed + 0;
1440 state.v4 = seed - XXH_PRIME32_1;
1441 /* do not write into reserved, planned to be removed in a future version */
1442 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
1443 return XXH_OK;
1444}
1445
1446
1447XXH_PUBLIC_API XXH_errorcode
1448XXH32_update(XXH32_state_t* state, const void* input, size_t len)
1449{
1450 if (input==NULL)
1451#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1452 return XXH_OK;
1453#else
1454 return XXH_ERROR;
1455#endif
1456
1457 { const xxh_u8* p = (const xxh_u8*)input;
1458 const xxh_u8* const bEnd = p + len;
1459
1460 state->total_len_32 += (XXH32_hash_t)len;
1461 state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
1462
1463 if (state->memsize + len < 16) { /* fill in tmp buffer */
1464 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
1465 state->memsize += (XXH32_hash_t)len;
1466 return XXH_OK;
1467 }
1468
1469 if (state->memsize) { /* some data left from previous update */
1470 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
1471 { const xxh_u32* p32 = state->mem32;
1472 state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
1473 state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
1474 state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
1475 state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
1476 }
1477 p += 16-state->memsize;
1478 state->memsize = 0;
1479 }
1480
1481 if (p <= bEnd-16) {
1482 const xxh_u8* const limit = bEnd - 16;
1483 xxh_u32 v1 = state->v1;
1484 xxh_u32 v2 = state->v2;
1485 xxh_u32 v3 = state->v3;
1486 xxh_u32 v4 = state->v4;
1487
1488 do {
1489 v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
1490 v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
1491 v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
1492 v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
1493 } while (p<=limit);
1494
1495 state->v1 = v1;
1496 state->v2 = v2;
1497 state->v3 = v3;
1498 state->v4 = v4;
1499 }
1500
1501 if (p < bEnd) {
1502 XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
1503 state->memsize = (unsigned)(bEnd-p);
1504 }
1505 }
1506
1507 return XXH_OK;
1508}
1509
1510
1511XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)
1512{
1513 xxh_u32 h32;
1514
1515 if (state->large_len) {
1516 h32 = XXH_rotl32(state->v1, 1)
1517 + XXH_rotl32(state->v2, 7)
1518 + XXH_rotl32(state->v3, 12)
1519 + XXH_rotl32(state->v4, 18);
1520 } else {
1521 h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
1522 }
1523
1524 h32 += state->total_len_32;
1525
1526 return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
1527}
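/*
 * Illustrative sketch (not part of the library): typical use of the streaming
 * API above. `next_chunk()` is a hypothetical helper that yields input pieces
 * of arbitrary size; error handling is omitted for brevity.
 *
 *   XXH32_state_t* const state = XXH32_createState();
 *   XXH32_reset(state, 0);                       // same seed as one-shot XXH32()
 *   while (next_chunk(&buf, &bufSize))
 *       XXH32_update(state, buf, bufSize);
 *   hash = XXH32_digest(state);                  // equals XXH32(whole_input, total_len, 0)
 *   XXH32_freeState(state);
 */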
1528
1529
1530/******* Canonical representation *******/
1531
1532/*
1533 * The default return values from XXH functions are unsigned 32 and 64 bit
1534 * integers.
1535 *
1536 * The canonical representation uses big endian convention, the same convention
1537 * as human-readable numbers (large digits first).
1538 *
1539 * This way, hash values can be written into a file or buffer, remaining
1540 * comparable across different systems.
1541 *
1542 * The following functions allow transformation of hash values to and from their
1543 * canonical format.
1544 */
1545XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
1546{
1547 XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
1548 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
1549 memcpy(dst, &hash, sizeof(*dst));
1550}
1551
1552XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
1553{
1554 return XXH_readBE32(src);
1555}
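/*
 * Illustrative sketch (not part of the library): writing a hash in canonical
 * (big endian) form so it stays comparable across platforms, then reading it
 * back. `buf`/`bufSize` are placeholders for the hashed input.
 *
 *   XXH32_canonical_t c;
 *   XXH32_canonicalFromHash(&c, XXH32(buf, bufSize, 0));
 *   // ... store or transmit the sizeof(c) bytes of c ...
 *   XXH32_hash_t const h = XXH32_hashFromCanonical(&c);   // same value on any endianness
 */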
1556
1557
1558#ifndef XXH_NO_LONG_LONG
1559
1560/* *******************************************************************
1561* 64-bit hash functions
1562*********************************************************************/
1563
1564/******* Memory access *******/
1565
1566typedef XXH64_hash_t xxh_u64;
1567
1568#ifdef XXH_OLD_NAMES
1569# define U64 xxh_u64
1570#endif
1571
1572/*!
1573 * XXH_REROLL_XXH64:
1574 * Whether to reroll the XXH64_finalize() loop.
1575 *
1576 * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
1577 * performance gain on 64-bit hosts, as only one jump is required.
1578 *
1579 * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
1580 * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
1581 * to unroll. The code becomes ridiculously large (the largest function in the
1582 * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
1583 * also slightly faster because it fits into cache better and is more likely
1584 * to be inlined by the compiler.
1585 *
1586 * If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
1587 */
1588#ifndef XXH_REROLL_XXH64
1589# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
1590 || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
1591 || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
1592 || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
1593 || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
1594 || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
1595# define XXH_REROLL_XXH64 1
1596# else
1597# define XXH_REROLL_XXH64 0
1598# endif
1599#endif /* !defined(XXH_REROLL_XXH64) */
1600
1601#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1602/*
1603 * Manual byteshift. Best for old compilers which don't inline memcpy.
1604 * We actually directly use XXH_readLE64 and XXH_readBE64.
1605 */
1606#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
1607
1608/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
1609static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
1610
1611#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1612
1613/*
1614 * __pack instructions are safer, but compiler specific, hence potentially
1615 * problematic for some compilers.
1616 *
1617 * Currently only defined for GCC and ICC.
1618 */
1619#ifdef XXH_OLD_NAMES
1620typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
1621#endif
1622static xxh_u64 XXH_read64(const void* ptr)
1623{
1624 typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
1625 return ((const xxh_unalign64*)ptr)->u64;
1626}
1627
1628#else
1629
1630/*
1631 * Portable and safe solution. Generally efficient.
1632 * see: https://stackoverflow.com/a/32095106/646947
1633 */
1634static xxh_u64 XXH_read64(const void* memPtr)
1635{
1636 xxh_u64 val;
1637 memcpy(&val, memPtr, sizeof(val));
1638 return val;
1639}
1640
1641#endif /* XXH_FORCE_MEMORY_ACCESS */
1642
1643#if defined(_MSC_VER)     /* Visual Studio */
1644# define XXH_swap64 _byteswap_uint64
1645#elif XXH_GCC_VERSION >= 403
1646# define XXH_swap64 __builtin_bswap64
1647#else
1648static xxh_u64 XXH_swap64 (xxh_u64 x)
1649{
1650 return ((x << 56) & 0xff00000000000000ULL) |
1651 ((x << 40) & 0x00ff000000000000ULL) |
1652 ((x << 24) & 0x0000ff0000000000ULL) |
1653 ((x << 8) & 0x000000ff00000000ULL) |
1654 ((x >> 8) & 0x00000000ff000000ULL) |
1655 ((x >> 24) & 0x0000000000ff0000ULL) |
1656 ((x >> 40) & 0x000000000000ff00ULL) |
1657 ((x >> 56) & 0x00000000000000ffULL);
1658}
1659#endif
1660
1661
1662/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
1663#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1664
1665XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
1666{
1667 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1668 return bytePtr[0]
1669 | ((xxh_u64)bytePtr[1] << 8)
1670 | ((xxh_u64)bytePtr[2] << 16)
1671 | ((xxh_u64)bytePtr[3] << 24)
1672 | ((xxh_u64)bytePtr[4] << 32)
1673 | ((xxh_u64)bytePtr[5] << 40)
1674 | ((xxh_u64)bytePtr[6] << 48)
1675 | ((xxh_u64)bytePtr[7] << 56);
1676}
1677
1678XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
1679{
1680 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1681 return bytePtr[7]
1682 | ((xxh_u64)bytePtr[6] << 8)
1683 | ((xxh_u64)bytePtr[5] << 16)
1684 | ((xxh_u64)bytePtr[4] << 24)
1685 | ((xxh_u64)bytePtr[3] << 32)
1686 | ((xxh_u64)bytePtr[2] << 40)
1687 | ((xxh_u64)bytePtr[1] << 48)
1688 | ((xxh_u64)bytePtr[0] << 56);
1689}
1690
1691#else
1692XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
1693{
1694 return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
1695}
1696
1697static xxh_u64 XXH_readBE64(const void* ptr)
1698{
1699 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
1700}
1701#endif
1702
1703XXH_FORCE_INLINE xxh_u64
1704XXH_readLE64_align(const void* ptr, XXH_alignment align)
1705{
1706 if (align==XXH_unaligned)
1707 return XXH_readLE64(ptr);
1708 else
1709 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
1710}
1711
1712
1713/******* xxh64 *******/
1714
1715static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
1716static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
1717static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
1718static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
1719static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
1720
1721#ifdef XXH_OLD_NAMES
1722# define PRIME64_1 XXH_PRIME64_1
1723# define PRIME64_2 XXH_PRIME64_2
1724# define PRIME64_3 XXH_PRIME64_3
1725# define PRIME64_4 XXH_PRIME64_4
1726# define PRIME64_5 XXH_PRIME64_5
1727#endif
1728
1729static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
1730{
1731 acc += input * XXH_PRIME64_2;
1732 acc = XXH_rotl64(acc, 31);
1733 acc *= XXH_PRIME64_1;
1734 return acc;
1735}
1736
1737static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
1738{
1739 val = XXH64_round(0, val);
1740 acc ^= val;
1741 acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
1742 return acc;
1743}
1744
1745static xxh_u64 XXH64_avalanche(xxh_u64 h64)
1746{
1747 h64 ^= h64 >> 33;
1748 h64 *= XXH_PRIME64_2;
1749 h64 ^= h64 >> 29;
1750 h64 *= XXH_PRIME64_3;
1751 h64 ^= h64 >> 32;
1752 return h64;
1753}
1754
1755
1756#define XXH_get64bits(p) XXH_readLE64_align(p, align)
1757
1758static xxh_u64
1759XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
1760{
1761#define XXH_PROCESS1_64 do { \
1762 h64 ^= (*ptr++) * XXH_PRIME64_5; \
1763 h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
1764} while (0)
1765
1766#define XXH_PROCESS4_64 do { \
1767 h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \
1768 ptr += 4; \
1769 h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
1770} while (0)
1771
1772#define XXH_PROCESS8_64 do { \
1773 xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
1774 ptr += 8; \
1775 h64 ^= k1; \
1776 h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
1777} while (0)
1778
1779 /* Rerolled version for 32-bit targets is faster and much smaller. */
1780 if (XXH_REROLL || XXH_REROLL_XXH64) {
1781 len &= 31;
1782 while (len >= 8) {
1783 XXH_PROCESS8_64;
1784 len -= 8;
1785 }
1786 if (len >= 4) {
1787 XXH_PROCESS4_64;
1788 len -= 4;
1789 }
1790 while (len > 0) {
1791 XXH_PROCESS1_64;
1792 --len;
1793 }
1794 return XXH64_avalanche(h64);
1795 } else {
1796 switch(len & 31) {
1797 case 24: XXH_PROCESS8_64;
1798 /* fallthrough */
1799 case 16: XXH_PROCESS8_64;
1800 /* fallthrough */
1801 case 8: XXH_PROCESS8_64;
1802 return XXH64_avalanche(h64);
1803
1804 case 28: XXH_PROCESS8_64;
1805 /* fallthrough */
1806 case 20: XXH_PROCESS8_64;
1807 /* fallthrough */
1808 case 12: XXH_PROCESS8_64;
1809 /* fallthrough */
1810 case 4: XXH_PROCESS4_64;
1811 return XXH64_avalanche(h64);
1812
1813 case 25: XXH_PROCESS8_64;
1814 /* fallthrough */
1815 case 17: XXH_PROCESS8_64;
1816 /* fallthrough */
1817 case 9: XXH_PROCESS8_64;
1818 XXH_PROCESS1_64;
1819 return XXH64_avalanche(h64);
1820
1821 case 29: XXH_PROCESS8_64;
1822 /* fallthrough */
1823 case 21: XXH_PROCESS8_64;
1824 /* fallthrough */
1825 case 13: XXH_PROCESS8_64;
1826 /* fallthrough */
1827 case 5: XXH_PROCESS4_64;
1828 XXH_PROCESS1_64;
1829 return XXH64_avalanche(h64);
1830
1831 case 26: XXH_PROCESS8_64;
1832 /* fallthrough */
1833 case 18: XXH_PROCESS8_64;
1834 /* fallthrough */
1835 case 10: XXH_PROCESS8_64;
1836 XXH_PROCESS1_64;
1837 XXH_PROCESS1_64;
1838 return XXH64_avalanche(h64);
1839
1840 case 30: XXH_PROCESS8_64;
1841 /* fallthrough */
1842 case 22: XXH_PROCESS8_64;
1843 /* fallthrough */
1844 case 14: XXH_PROCESS8_64;
1845 /* fallthrough */
1846 case 6: XXH_PROCESS4_64;
1847 XXH_PROCESS1_64;
1848 XXH_PROCESS1_64;
1849 return XXH64_avalanche(h64);
1850
1851 case 27: XXH_PROCESS8_64;
1852 /* fallthrough */
1853 case 19: XXH_PROCESS8_64;
1854 /* fallthrough */
1855 case 11: XXH_PROCESS8_64;
1856 XXH_PROCESS1_64;
1857 XXH_PROCESS1_64;
1858 XXH_PROCESS1_64;
1859 return XXH64_avalanche(h64);
1860
1861 case 31: XXH_PROCESS8_64;
1862 /* fallthrough */
1863 case 23: XXH_PROCESS8_64;
1864 /* fallthrough */
1865 case 15: XXH_PROCESS8_64;
1866 /* fallthrough */
1867 case 7: XXH_PROCESS4_64;
1868 /* fallthrough */
1869 case 3: XXH_PROCESS1_64;
1870 /* fallthrough */
1871 case 2: XXH_PROCESS1_64;
1872 /* fallthrough */
1873 case 1: XXH_PROCESS1_64;
1874 /* fallthrough */
1875 case 0: return XXH64_avalanche(h64);
1876 }
1877 }
1878 /* impossible to reach */
1879 XXH_ASSERT(0);
1880 return 0; /* unreachable, but some compilers complain without it */
1881}
1882
1883#ifdef XXH_OLD_NAMES
1884# define PROCESS1_64 XXH_PROCESS1_64
1885# define PROCESS4_64 XXH_PROCESS4_64
1886# define PROCESS8_64 XXH_PROCESS8_64
1887#else
1888# undef XXH_PROCESS1_64
1889# undef XXH_PROCESS4_64
1890# undef XXH_PROCESS8_64
1891#endif
1892
1893XXH_FORCE_INLINE xxh_u64
1894XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
1895{
1896 const xxh_u8* bEnd = input + len;
1897 xxh_u64 h64;
1898
1899#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1900 if (input==NULL) {
1901 len=0;
1902 bEnd=input=(const xxh_u8*)(size_t)32;
1903 }
1904#endif
1905
1906 if (len>=32) {
1907 const xxh_u8* const limit = bEnd - 32;
1908 xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
1909 xxh_u64 v2 = seed + XXH_PRIME64_2;
1910 xxh_u64 v3 = seed + 0;
1911 xxh_u64 v4 = seed - XXH_PRIME64_1;
1912
1913 do {
1914 v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
1915 v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
1916 v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
1917 v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
1918 } while (input<=limit);
1919
1920 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
1921 h64 = XXH64_mergeRound(h64, v1);
1922 h64 = XXH64_mergeRound(h64, v2);
1923 h64 = XXH64_mergeRound(h64, v3);
1924 h64 = XXH64_mergeRound(h64, v4);
1925
1926 } else {
1927 h64 = seed + XXH_PRIME64_5;
1928 }
1929
1930 h64 += (xxh_u64) len;
1931
1932 return XXH64_finalize(h64, input, len, align);
1933}
1934
1935
1936XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
1937{
1938#if 0
1939 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1940 XXH64_state_t state;
1941 XXH64_reset(&state, seed);
1942 XXH64_update(&state, (const xxh_u8*)input, len);
1943 return XXH64_digest(&state);
1944
1945#else
1946
1947 if (XXH_FORCE_ALIGN_CHECK) {
1948 if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
1949 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
1950 } }
1951
1952 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
1953
1954#endif
1955}
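/*
 * Illustrative sketch (not part of the library): seeded one-shot usage of
 * XXH64(). The message and seed values below are arbitrary examples.
 *
 *   const char msg[] = "example input";
 *   XXH64_hash_t const h0 = XXH64(msg, sizeof(msg) - 1, 0);
 *   XXH64_hash_t const h1 = XXH64(msg, sizeof(msg) - 1, 42);
 *   // h0 and h1 differ in general: the seed changes the whole hash.
 */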
1956
1957/******* Hash Streaming *******/
1958
1959XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
1960{
1961 return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
1962}
1963XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
1964{
1965 XXH_free(statePtr);
1966 return XXH_OK;
1967}
1968
1969XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
1970{
1971 memcpy(dstState, srcState, sizeof(*dstState));
1972}
1973
1974XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
1975{
1976 XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
1977 memset(&state, 0, sizeof(state));
1978 state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
1979 state.v2 = seed + XXH_PRIME64_2;
1980 state.v3 = seed + 0;
1981 state.v4 = seed - XXH_PRIME64_1;
1982 /* do not write into reserved64, might be removed in a future version */
1983 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
1984 return XXH_OK;
1985}
1986
1987XXH_PUBLIC_API XXH_errorcode
1988XXH64_update (XXH64_state_t* state, const void* input, size_t len)
1989{
1990 if (input==NULL)
1991#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1992 return XXH_OK;
1993#else
1994 return XXH_ERROR;
1995#endif
1996
1997 { const xxh_u8* p = (const xxh_u8*)input;
1998 const xxh_u8* const bEnd = p + len;
1999
2000 state->total_len += len;
2001
2002 if (state->memsize + len < 32) { /* fill in tmp buffer */
2003 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
2004 state->memsize += (xxh_u32)len;
2005 return XXH_OK;
2006 }
2007
2008 if (state->memsize) { /* tmp buffer is full */
2009 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
2010 state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
2011 state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
2012 state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
2013 state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
2014 p += 32-state->memsize;
2015 state->memsize = 0;
2016 }
2017
2018 if (p+32 <= bEnd) {
2019 const xxh_u8* const limit = bEnd - 32;
2020 xxh_u64 v1 = state->v1;
2021 xxh_u64 v2 = state->v2;
2022 xxh_u64 v3 = state->v3;
2023 xxh_u64 v4 = state->v4;
2024
2025 do {
2026 v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
2027 v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
2028 v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
2029 v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
2030 } while (p<=limit);
2031
2032 state->v1 = v1;
2033 state->v2 = v2;
2034 state->v3 = v3;
2035 state->v4 = v4;
2036 }
2037
2038 if (p < bEnd) {
2039 XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
2040 state->memsize = (unsigned)(bEnd-p);
2041 }
2042 }
2043
2044 return XXH_OK;
2045}
2046
2047
2048XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
2049{
2050 xxh_u64 h64;
2051
2052 if (state->total_len >= 32) {
2053 xxh_u64 const v1 = state->v1;
2054 xxh_u64 const v2 = state->v2;
2055 xxh_u64 const v3 = state->v3;
2056 xxh_u64 const v4 = state->v4;
2057
2058 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2059 h64 = XXH64_mergeRound(h64, v1);
2060 h64 = XXH64_mergeRound(h64, v2);
2061 h64 = XXH64_mergeRound(h64, v3);
2062 h64 = XXH64_mergeRound(h64, v4);
2063 } else {
2064 h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2065 }
2066
2067 h64 += (xxh_u64) state->total_len;
2068
2069 return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
2070}
2071
2072
2073/******* Canonical representation *******/
2074
2075XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
2076{
2077 XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2078 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2079 memcpy(dst, &hash, sizeof(*dst));
2080}
2081
2082XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
2083{
2084 return XXH_readBE64(src);
2085}
2086
2087
2088
2089/* *********************************************************************
2090* XXH3
2091* New generation hash designed for speed on small keys and vectorization
2092************************************************************************ */
2093
2094/* === Compiler specifics === */
2095
2096#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
2097# define XXH_RESTRICT restrict
2098#else
2099/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
2100# define XXH_RESTRICT /* disable */
2101#endif
2102
2103#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
2104 || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
2105 || defined(__clang__)
2106# define XXH_likely(x) __builtin_expect(x, 1)
2107# define XXH_unlikely(x) __builtin_expect(x, 0)
2108#else
2109# define XXH_likely(x) (x)
2110# define XXH_unlikely(x) (x)
2111#endif
2112
2113#if defined(__GNUC__)
2114# if defined(__AVX2__)
2115# include <immintrin.h>
2116# elif defined(__SSE2__)
2117# include <emmintrin.h>
2118# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
2119# define inline __inline__ /* circumvent a clang bug */
2120# include <arm_neon.h>
2121# undef inline
2122# endif
2123#elif defined(_MSC_VER)
2124# include <intrin.h>
2125#endif
2126
2127/*
2128 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
2129 * remaining a true 64-bit/128-bit hash function.
2130 *
2131 * This is done by prioritizing a subset of 64-bit operations that can be
2132 * emulated without too many steps on the average 32-bit machine.
2133 *
2134 * For example, these two lines seem similar, and run equally fast on 64-bit:
2135 *
2136 * xxh_u64 x;
2137 * x ^= (x >> 47); // good
2138 * x ^= (x >> 13); // bad
2139 *
2140 * However, to a 32-bit machine, there is a major difference.
2141 *
2142 * x ^= (x >> 47) looks like this:
2143 *
2144 * x.lo ^= (x.hi >> (47 - 32));
2145 *
2146 * while x ^= (x >> 13) looks like this:
2147 *
2148 * // note: funnel shifts are not usually cheap.
2149 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
2150 * x.hi ^= (x.hi >> 13);
2151 *
2152 * The first one is significantly faster than the second, simply because the
2153 * shift is larger than 32. This means:
2154 * - All the bits we need are in the upper 32 bits, so we can ignore the lower
2155 * 32 bits in the shift.
2156 * - The shift result will always fit in the lower 32 bits, and therefore,
2157 * we can ignore the upper 32 bits in the xor.
2158 *
2159 * Thanks to this optimization, XXH3 only requires these features to be efficient:
2160 *
2161 * - Usable unaligned access
2162 * - A 32-bit or 64-bit ALU
2163 * - If 32-bit, a decent ADC instruction
2164 * - A 32 or 64-bit multiply with a 64-bit result
2165 * - For the 128-bit variant, a decent byteswap helps short inputs.
2166 *
2167 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
2168 * platforms which can run XXH32 can run XXH3 efficiently.
2169 *
2170 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
2171 * notable exception.
2172 *
2173 * First of all, Thumb-1 lacks support for the UMULL instruction which
2174 * performs the important long multiply. This means numerous __aeabi_lmul
2175 * calls.
2176 *
2177 * Second of all, the 8 functional registers are just not enough.
2178 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
2179 * Lo registers, and this shuffling results in thousands more MOVs than A32.
2180 *
2181 * A32 and T32 don't have this limitation. They can access all 14 registers,
2182 * do a 32->64 multiply with UMULL, and the flexible operand allowing free
2183 * shifts is helpful, too.
2184 *
2185 * Therefore, we do a quick sanity check.
2186 *
2187 * If compiling Thumb-1 for a target which supports ARM instructions, we will
2188 * emit a warning, as it is not a "sane" platform to compile for.
2189 *
2190 * Usually, if this happens, it is because of an accident and you probably need
2191 * to specify -march, as you likely meant to compile for a newer architecture.
2192 *
2193 * Credit: large sections of the vectorial and asm source code paths
2194 * have been contributed by @easyaspi314
2195 */
2196#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
2197# warning "XXH3 is highly inefficient without ARM or Thumb-2."
2198#endif
2199
2200/* ==========================================
2201 * Vectorization detection
2202 * ========================================== */
2203#define XXH_SCALAR 0 /* Portable scalar version */
2204#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
2205#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
2206#define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */
2207#define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */
2208#define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */
2209
2210#ifndef XXH_VECTOR /* can be defined on command line */
2211# if defined(__AVX512F__)
2212# define XXH_VECTOR XXH_AVX512
2213# elif defined(__AVX2__)
2214# define XXH_VECTOR XXH_AVX2
2215# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2216# define XXH_VECTOR XXH_SSE2
2217# elif defined(__GNUC__) /* msvc support maybe later */ \
2218 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
2219 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2220 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2221# define XXH_VECTOR XXH_NEON
2222# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
2223 || (defined(__s390x__) && defined(__VEC__)) \
2224 && defined(__GNUC__) /* TODO: IBM XL */
2225# define XXH_VECTOR XXH_VSX
2226# else
2227# define XXH_VECTOR XXH_SCALAR
2228# endif
2229#endif
2230
2231/*
2232 * Controls the alignment of the accumulator,
2233 * for compatibility with aligned vector loads, which are usually faster.
2234 */
2235#ifndef XXH_ACC_ALIGN
2236# if defined(XXH_X86DISPATCH)
2237# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
2238# elif XXH_VECTOR == XXH_SCALAR /* scalar */
2239# define XXH_ACC_ALIGN 8
2240# elif XXH_VECTOR == XXH_SSE2 /* sse2 */
2241# define XXH_ACC_ALIGN 16
2242# elif XXH_VECTOR == XXH_AVX2 /* avx2 */
2243# define XXH_ACC_ALIGN 32
2244# elif XXH_VECTOR == XXH_NEON /* neon */
2245# define XXH_ACC_ALIGN 16
2246# elif XXH_VECTOR == XXH_VSX /* vsx */
2247# define XXH_ACC_ALIGN 16
2248# elif XXH_VECTOR == XXH_AVX512 /* avx512 */
2249# define XXH_ACC_ALIGN 64
2250# endif
2251#endif
2252
2253#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
2254 || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
2255# define XXH_SEC_ALIGN XXH_ACC_ALIGN
2256#else
2257# define XXH_SEC_ALIGN 8
2258#endif
2259
2260/*
2261 * UGLY HACK:
2262 * GCC usually generates the best code with -O3 for xxHash.
2263 *
2264 * However, when targeting AVX2, it is overzealous in its unrolling resulting
2265 * in code roughly 3/4 the speed of Clang.
2266 *
2267 * There are other issues, such as GCC splitting _mm256_loadu_si256 into
2268 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
2269 * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
2270 *
2271 * That is why when compiling the AVX2 version, it is recommended to use either
2272 * -O2 -mavx2 -march=haswell
2273 * or
2274 * -O2 -mavx2 -mno-avx256-split-unaligned-load
2275 * for decent performance, or to use Clang instead.
2276 *
2277 * Fortunately, we can control the first one with a pragma that forces GCC into
2278 * -O2, but the other one we can't control without "failed to inline always
2279 * inline function due to target mismatch" warnings.
2280 */
2281#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
2282 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2283 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
2284# pragma GCC push_options
2285# pragma GCC optimize("-O2")
2286#endif
2287
2288
2289#if XXH_VECTOR == XXH_NEON
2290/*
2291 * NEON's setup for vmlal_u32 is a little more complicated than it is on
2292 * SSE2, AVX2, and VSX.
2293 *
2294 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
2295 *
2296 * To do the same operation, the 128-bit 'Q' register needs to be split into
2297 * two 64-bit 'D' registers, performing this operation:
2298 *
2299 * [ a | b ]
2300 * | '---------. .--------' |
2301 * | x |
2302 * | .---------' '--------. |
2303 * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
2304 *
2305 * Due to significant changes in aarch64, the fastest method for aarch64 is
2306 * completely different than the fastest method for ARMv7-A.
2307 *
2308 * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
2309 * D11 will modify the high half of Q5. This is similar to how modifying AH
2310 * will only affect bits 8-15 of AX on x86.
2311 *
2312 * VZIP takes two registers, and puts even lanes in one register and odd lanes
2313 * in the other.
2314 *
2315 * On ARMv7-A, this strangely modifies both parameters in place instead of
2316 * taking the usual 3-operand form.
2317 *
2318 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
2319 * lower and upper halves of the Q register to end up with the high and low
2320 * halves where we want - all in one instruction.
2321 *
2322 * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
2323 *
2324 * Unfortunately we need inline assembly for this: instructions modifying two
2325 * registers at once are not possible in GCC or Clang's IR, so they have to
2326 * create a copy.
2327 *
2328 * aarch64 requires a different approach.
2329 *
2330 * In order to make it easier to write a decent compiler for aarch64, many
2331 * quirks were removed, such as conditional execution.
2332 *
2333 * NEON was also affected by this.
2334 *
2335 * aarch64 cannot access the high bits of a Q-form register, and writes to a
2336 * D-form register zero the high bits, similar to how writes to W-form scalar
2337 * registers (or DWORD registers on x86_64) work.
2338 *
2339 * The formerly free vget_high intrinsics now require a vext (with a few
2340 * exceptions).
2341 *
2342 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
2343 * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
2344 * operand.
2345 *
2346 * The equivalent of the VZIP.32 on the lower and upper halves would be this
2347 * mess:
2348 *
2349 * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
2350 * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
2351 * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
2352 *
2353 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
2354 *
2355 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
2356 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
2357 *
2358 * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
2359 */
2360
2361/*
2362 * Function-like macro:
2363 * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
2364 * {
2365 * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
2366 * outHi = (uint32x2_t)(in >> 32);
2367 * in = UNDEFINED;
2368 * }
2369 */
2370# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
2371 && defined(__GNUC__) \
2372 && !defined(__aarch64__) && !defined(__arm64__)
2373# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2374 do { \
2375 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
2376 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
2377 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
2378 __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
2379 (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
2380 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
2381 } while (0)
2382# else
2383# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2384 do { \
2385 (outLo) = vmovn_u64 (in); \
2386 (outHi) = vshrn_n_u64 ((in), 32); \
2387 } while (0)
2388# endif
2389#endif /* XXH_VECTOR == XXH_NEON */
2390
2391/*
2392 * VSX and Z Vector helpers.
2393 *
2394 * This is very messy, and any pull requests to clean this up are welcome.
2395 *
2396 * There are a lot of problems with supporting VSX and s390x, due to
2397 * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
2398 */
2399#if XXH_VECTOR == XXH_VSX
2400# if defined(__s390x__)
2401# include <s390intrin.h>
2402# else
2403/* gcc's altivec.h can have the unwanted consequence of unconditionally
2404 * #defining the bool, vector, and pixel keywords,
2405 * which breaks programs already using these keywords for other purposes.
2406 * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
2407 * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
2408 * but it seems that, in some cases, it isn't.
2409 * Force the build macro to be defined, so that keywords are not altered.
2410 */
2411# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
2412# define __APPLE_ALTIVEC__
2413# endif
2414# include <altivec.h>
2415# endif
2416
2417typedef __vector unsigned long long xxh_u64x2;
2418typedef __vector unsigned char xxh_u8x16;
2419typedef __vector unsigned xxh_u32x4;
2420
2421# ifndef XXH_VSX_BE
2422# if defined(__BIG_ENDIAN__) \
2423 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
2424# define XXH_VSX_BE 1
2425# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
2426# warning "-maltivec=be is not recommended. Please use native endianness."
2427# define XXH_VSX_BE 1
2428# else
2429# define XXH_VSX_BE 0
2430# endif
2431# endif /* !defined(XXH_VSX_BE) */
2432
2433# if XXH_VSX_BE
2434/* A wrapper for POWER9's vec_revb. */
2435# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
2436# define XXH_vec_revb vec_revb
2437# else
2438XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
2439{
2440 xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
2441 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
2442 return vec_perm(val, val, vByteSwap);
2443}
2444# endif
2445# endif /* XXH_VSX_BE */
2446
2447/*
2448 * Performs an unaligned load and byte swaps it on big endian.
2449 */
2450XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
2451{
2452 xxh_u64x2 ret;
2453 memcpy(&ret, ptr, sizeof(xxh_u64x2));
2454# if XXH_VSX_BE
2455 ret = XXH_vec_revb(ret);
2456# endif
2457 return ret;
2458}
2459
2460/*
2461 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
2462 *
2463 * These intrinsics weren't added until GCC 8, despite existing for a while,
2464 * and they are endian dependent. Also, their meaning swaps depending on the version.
2465 */
2466# if defined(__s390x__)
2467 /* s390x is always big endian, no issue on this platform */
2468# define XXH_vec_mulo vec_mulo
2469# define XXH_vec_mule vec_mule
2470# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
2471/* Clang has a better way to control this: we can just use the builtin, which doesn't swap. */
2472# define XXH_vec_mulo __builtin_altivec_vmulouw
2473# define XXH_vec_mule __builtin_altivec_vmuleuw
2474# else
2475/* gcc needs inline assembly */
2476/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
2477XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
2478{
2479 xxh_u64x2 result;
2480 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
2481 return result;
2482}
2483XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
2484{
2485 xxh_u64x2 result;
2486 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
2487 return result;
2488}
2489# endif /* XXH_vec_mulo, XXH_vec_mule */
2490#endif /* XXH_VECTOR == XXH_VSX */
2491
2492
2493/* prefetch
2494 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
2495#if defined(XXH_NO_PREFETCH)
2496# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
2497#else
2498# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
2499# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
2500# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
2501# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
2502# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
2503# else
2504# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
2505# endif
2506#endif /* XXH_NO_PREFETCH */
2507
2508
2509/* ==========================================
2510 * XXH3 default settings
2511 * ========================================== */
2512
2513#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
2514
2515#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
2516# error "default keyset is not large enough"
2517#endif
2518
2519/* Pseudorandom secret taken directly from FARSH */
2520XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
2521 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
2522 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
2523 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
2524 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
2525 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
2526 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
2527 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
2528 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
2529 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
2530 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
2531 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
2532 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
2533};
2534
2535
2536#ifdef XXH_OLD_NAMES
2537# define kSecret XXH3_kSecret
2538#endif
2539
2540/*
2541 * Calculates a 32-bit to 64-bit long multiply.
2542 *
2543 * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
2544 * need to (but it shouldn't need to anyway, it is about 7 instructions to do
2545 * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
2546 * use that instead of the normal method.
2547 *
2548 * If you are compiling for platforms like Thumb-1 and don't have a better option,
2549 * you may also want to write your own long multiply routine here.
2550 *
2551 * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
2552 * {
2553 * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
2554 * }
2555 */
2556#if defined(_MSC_VER) && defined(_M_IX86)
2557# include <intrin.h>
2558# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
2559#else
2560/*
2561 * Downcast + upcast is usually better than masking on older compilers like
2562 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
2563 *
2564 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
2565 * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
2566 */
2567# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
2568#endif
2569
2570/*
2571 * Calculates a 64->128-bit long multiply.
2572 *
2573 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
2574 */
2575static XXH128_hash_t
2576XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
2577{
2578 /*
2579 * GCC/Clang __uint128_t method.
2580 *
2581 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
2582 * This is usually the best way as it usually uses a native long 64-bit
2583 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
2584 *
2585 * Usually.
2586 *
2587 * However, on some 32-bit targets such as wasm, Clang (and Emscripten) define this
2588 * type despite not having the native arithmetic for it. This results in a laggy
2589 * compiler builtin call which calculates a full 128-bit multiply.
2590 * In that case it is best to use the portable one.
2591 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
2592 */
2593#if defined(__GNUC__) && !defined(__wasm__) \
2594 && defined(__SIZEOF_INT128__) \
2595 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
2596
2597 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
2598 XXH128_hash_t r128;
2599 r128.low64 = (xxh_u64)(product);
2600 r128.high64 = (xxh_u64)(product >> 64);
2601 return r128;
2602
2603 /*
2604 * MSVC for x64's _umul128 method.
2605 *
2606 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
2607 *
2608 * This compiles to single operand MUL on x64.
2609 */
2610#elif defined(_M_X64) || defined(_M_IA64)
2611
2612#ifndef _MSC_VER
2613# pragma intrinsic(_umul128)
2614#endif
2615 xxh_u64 product_high;
2616 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
2617 XXH128_hash_t r128;
2618 r128.low64 = product_low;
2619 r128.high64 = product_high;
2620 return r128;
2621
2622#else
2623 /*
2624 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
2625 *
2626 * This is a fast and simple grade school multiply, which is shown below
2627 * with base 10 arithmetic instead of base 0x100000000.
2628 *
2629 * 9 3 // D2 lhs = 93
2630 * x 7 5 // D2 rhs = 75
2631 * ----------
2632 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
2633 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
2634 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
2635 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
2636 * ---------
2637 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
2638 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
2639 * ---------
2640 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
2641 *
2642 * The reasons for adding the products like this are:
2643 * 1. It avoids manual carry tracking. Just like how
2644 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
2645 * This avoids a lot of complexity.
2646 *
2647 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
2648 * instruction available in ARM's Digital Signal Processing extension
2649 * in 32-bit ARMv6 and later, which is shown below:
2650 *
2651 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
2652 * {
2653 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
2654 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
2655 * *RdHi = (xxh_u32)(product >> 32);
2656 * }
2657 *
2658 * This instruction was designed for efficient long multiplication, and
2659 * allows this to be calculated in only 4 instructions at speeds
2660 * comparable to some 64-bit ALUs.
2661 *
2662 * 3. It isn't terrible on other platforms. Usually this will be a couple
2663 * of 32-bit ADD/ADCs.
2664 */
2665
2666 /* First calculate all of the cross products. */
2667 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
2668 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
2669 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
2670 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
2671
2672 /* Now add the products together. These will never overflow. */
2673 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
2674 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
2675 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
2676
2677 XXH128_hash_t r128;
2678 r128.low64 = lower;
2679 r128.high64 = upper;
2680 return r128;
2681#endif
2682}
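/*
 * Illustrative sketch (not part of the library): a quick sanity check of
 * XXH_mult64to128(), assuming a compiler that provides __uint128_t and
 * <assert.h>. Handy when porting the portable scalar path to a new platform.
 *
 *   xxh_u64 const a = 0x9E3779B185EBCA87ULL, b = 0xC2B2AE3D27D4EB4FULL;
 *   XXH128_hash_t const r = XXH_mult64to128(a, b);
 *   __uint128_t const ref = (__uint128_t)a * b;
 *   assert(r.low64 == (xxh_u64)ref && r.high64 == (xxh_u64)(ref >> 64));
 */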
2683
2684/*
2685 * Does a 64-bit to 128-bit multiply, then XOR folds it.
2686 *
2687 * The reason for the separate function is to prevent passing too many structs
2688 * around by value. This will hopefully inline the multiply, but we don't force it.
2689 */
2690static xxh_u64
2691XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
2692{
2693 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
2694 return product.low64 ^ product.high64;
2695}
2696
2697/* Seems to produce slightly better code on GCC for some reason. */
2698XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
2699{
2700 XXH_ASSERT(0 <= shift && shift < 64);
2701 return v64 ^ (v64 >> shift);
2702}
2703
2704/*
2705 * This is a fast avalanche stage,
2706 * suitable when input bits are already partially mixed
2707 */
2708static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
2709{
2710 h64 = XXH_xorshift64(h64, 37);
2711 h64 *= 0x165667919E3779F9ULL;
2712 h64 = XXH_xorshift64(h64, 32);
2713 return h64;
2714}
2715
2716/*
2717 * This is a stronger avalanche,
2718 * inspired by Pelle Evensen's rrmxmx
2719 * preferable when input has not been previously mixed
2720 */
2721static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
2722{
2723 /* this mix is inspired by Pelle Evensen's rrmxmx */
2724 h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
2725 h64 *= 0x9FB21C651E98DF25ULL;
2726 h64 ^= (h64 >> 35) + len ;
2727 h64 *= 0x9FB21C651E98DF25ULL;
2728 return XXH_xorshift64(h64, 28);
2729}
2730
2731
2732/* ==========================================
2733 * Short keys
2734 * ==========================================
2735 * One of the shortcomings of XXH32 and XXH64 was that their performance was
2736 * sub-optimal on short lengths. They used an iterative algorithm which strongly
2737 * favored lengths that were a multiple of 4 or 8.
2738 *
2739 * Instead of iterating over individual inputs, we use a set of single shot
2740 * functions which piece together a range of lengths and operate in constant time.
2741 *
2742 * Additionally, the number of multiplies has been significantly reduced. This
2743 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
2744 *
2745 * Depending on the platform, this may or may not be faster than XXH32, but it
2746 * is almost guaranteed to be faster than XXH64.
2747 */
2748
2749/*
2750 * At very short lengths, there isn't enough input to fully hide secrets, or use
2751 * the entire secret.
2752 *
2753 * There is also only a limited amount of mixing we can do before significantly
2754 * impacting performance.
2755 *
2756 * Therefore, we use different sections of the secret and always mix two secret
2757 * samples with an XOR. This should have no effect on performance on the
2758 * seedless or withSeed variants because everything _should_ be constant folded
2759 * by modern compilers.
2760 *
2761 * The XOR mixing hides individual parts of the secret and increases entropy.
2762 *
2763 * This adds an extra layer of strength for custom secrets.
2764 */
2765XXH_FORCE_INLINE XXH64_hash_t
2766XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2767{
2768 XXH_ASSERT(input != NULL);
2769 XXH_ASSERT(1 <= len && len <= 3);
2770 XXH_ASSERT(secret != NULL);
2771 /*
2772 * len = 1: combined = { input[0], 0x01, input[0], input[0] }
2773 * len = 2: combined = { input[1], 0x02, input[0], input[1] }
2774 * len = 3: combined = { input[2], 0x03, input[0], input[1] }
2775 */
2776 { xxh_u8 const c1 = input[0];
2777 xxh_u8 const c2 = input[len >> 1];
2778 xxh_u8 const c3 = input[len - 1];
2779 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)
2780 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
2781 xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
2782 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
2783 return XXH64_avalanche(keyed);
2784 }
2785}
2786
2787XXH_FORCE_INLINE XXH64_hash_t
2788XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2789{
2790 XXH_ASSERT(input != NULL);
2791 XXH_ASSERT(secret != NULL);
2792 XXH_ASSERT(4 <= len && len < 8);
2793 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
2794 { xxh_u32 const input1 = XXH_readLE32(input);
2795 xxh_u32 const input2 = XXH_readLE32(input + len - 4);
2796 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
2797 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
2798 xxh_u64 const keyed = input64 ^ bitflip;
2799 return XXH3_rrmxmx(keyed, len);
2800 }
2801}
2802
2803XXH_FORCE_INLINE XXH64_hash_t
2804XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2805{
2806 XXH_ASSERT(input != NULL);
2807 XXH_ASSERT(secret != NULL);
2808 XXH_ASSERT(8 <= len && len <= 16);
2809 { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
2810 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
2811 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
2812 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
2813 xxh_u64 const acc = len
2814 + XXH_swap64(input_lo) + input_hi
2815 + XXH3_mul128_fold64(input_lo, input_hi);
2816 return XXH3_avalanche(acc);
2817 }
2818}
2819
2820XXH_FORCE_INLINE XXH64_hash_t
2821XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2822{
2823 XXH_ASSERT(len <= 16);
2824 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
2825 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
2826 if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
2827 return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
2828 }
2829}
2830
2831/*
2832 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
2833 * multiplication by zero, affecting hashes of lengths 17 to 240.
2834 *
2835 * However, they are very unlikely.
2836 *
2837 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
2838 * unseeded non-cryptographic hashes, it does not attempt to defend itself
2839 * against specially crafted inputs, only random inputs.
2840 *
2841 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
2842 * cancelling out the secret is taken an arbitrary number of times (addressed
2843 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
2844 * and/or proper seeding:
2845 *
2846 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
2847 * function that is only called up to 16 times per hash with up to 240 bytes of
2848 * input.
2849 *
2850 * This is not too bad for a non-cryptographic hash function, especially with
2851 * only 64 bit outputs.
2852 *
2853 * The 128-bit variant (which trades some speed for strength) is NOT affected
2854 * by this, although it is always a good idea to use a proper seed if you care
2855 * about strength.
2856 */
2857XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
2858 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
2859{
2860#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2861 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
2862 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
2863 /*
2864 * UGLY HACK:
2865 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
2866 * slower code.
2867 *
2868 * By forcing seed64 into a register, we disrupt the cost model and
2869 * cause it to scalarize. See `XXH32_round()`
2870 *
2871 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
2872 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
2873 * GCC 9.2, despite both emitting scalar code.
2874 *
2875 * GCC generates much better scalar code than Clang for the rest of XXH3,
2876 * which is why finding a more optimal codepath is an interest.
2877 */
2878 __asm__ ("" : "+r" (seed64));
2879#endif
2880 { xxh_u64 const input_lo = XXH_readLE64(input);
2881 xxh_u64 const input_hi = XXH_readLE64(input+8);
2882 return XXH3_mul128_fold64(
2883 input_lo ^ (XXH_readLE64(secret) + seed64),
2884 input_hi ^ (XXH_readLE64(secret+8) - seed64)
2885 );
2886 }
2887}
2888
2889/* For mid range keys, XXH3 uses a Mum-hash variant. */
2890XXH_FORCE_INLINE XXH64_hash_t
2891XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
2892 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
2893 XXH64_hash_t seed)
2894{
2895 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
2896 XXH_ASSERT(16 < len && len <= 128);
2897
2898 { xxh_u64 acc = len * XXH_PRIME64_1;
2899 if (len > 32) {
2900 if (len > 64) {
2901 if (len > 96) {
2902 acc += XXH3_mix16B(input+48, secret+96, seed);
2903 acc += XXH3_mix16B(input+len-64, secret+112, seed);
2904 }
2905 acc += XXH3_mix16B(input+32, secret+64, seed);
2906 acc += XXH3_mix16B(input+len-48, secret+80, seed);
2907 }
2908 acc += XXH3_mix16B(input+16, secret+32, seed);
2909 acc += XXH3_mix16B(input+len-32, secret+48, seed);
2910 }
2911 acc += XXH3_mix16B(input+0, secret+0, seed);
2912 acc += XXH3_mix16B(input+len-16, secret+16, seed);
2913
2914 return XXH3_avalanche(acc);
2915 }
2916}
2917
2918#define XXH3_MIDSIZE_MAX 240
2919
2920XXH_NO_INLINE XXH64_hash_t
2921XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
2922 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
2923 XXH64_hash_t seed)
2924{
2925 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
2926 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
2927
2928 #define XXH3_MIDSIZE_STARTOFFSET 3
2929 #define XXH3_MIDSIZE_LASTOFFSET 17
2930
2931 { xxh_u64 acc = len * XXH_PRIME64_1;
2932 int const nbRounds = (int)len / 16;
2933 int i;
2934 for (i=0; i<8; i++) {
2935 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
2936 }
2937 acc = XXH3_avalanche(acc);
2938 XXH_ASSERT(nbRounds >= 8);
2939#if defined(__clang__) /* Clang */ \
2940 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
2941 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
2942 /*
2943 * UGLY HACK:
2944 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC on x86.
2945 * Everywhere else, it uses scalar code.
2946 *
2947 * For 64->128-bit multiplies, even if the NEON code were 100% optimal, it
2948 * would still be slower than UMAAL (see XXH_mult64to128).
2949 *
2950 * Unfortunately, Clang doesn't handle the long multiplies properly and
2951 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
2952 * scalarized into an ugly mess of VMOV.32 instructions.
2953 *
2954 * This mess is difficult to avoid without turning autovectorization
2955 * off completely, but the affected cases are usually relatively minor
2956 * and/or not worth fixing.
2957 *
2958 * This loop is the easiest to fix, as unlike XXH32, this pragma
2959 * _actually works_ because it is a loop vectorization instead of an
2960 * SLP vectorization.
2961 */
2962 #pragma clang loop vectorize(disable)
2963#endif
2964 for (i=8 ; i < nbRounds; i++) {
2965 acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
2966 }
2967 /* last bytes */
2968 acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
2969 return XXH3_avalanche(acc);
2970 }
2971}
2972
2973
2974/* ======= Long Keys ======= */
2975
2976#define XXH_STRIPE_LEN 64
2977#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
2978#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
2979
2980#ifdef XXH_OLD_NAMES
2981# define STRIPE_LEN XXH_STRIPE_LEN
2982# define ACC_NB XXH_ACC_NB
2983#endif
2984
2985XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
2986{
2987 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
2988 memcpy(dst, &v64, sizeof(v64));
2989}
2990
2991/* Several intrinsic functions below are supposed to accept __int64 as argument,
2992 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
2993 * However, several environments do not define __int64 type,
2994 * requiring a workaround.
2995 */
2996#if !defined (__VMS) \
2997 && (defined (__cplusplus) \
2998 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
2999 typedef int64_t xxh_i64;
3000#else
3001 /* the following type must have a width of 64-bit */
3002 typedef long long xxh_i64;
3003#endif
3004
3005/*
3006 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
3007 *
3008 * It is a hardened version of UMAC, based on FARSH's implementation.
3009 *
3010 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
3011 * implementations, and it is ridiculously fast.
3012 *
3013 * We harden it by mixing the original input into the accumulators as well as the product.
3014 *
3015 * This means that in the (relatively likely) case of a multiply by zero, the
3016 * original input is preserved.
3017 *
3018 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
3019 * cross-pollination, as otherwise the upper and lower halves would be
3020 * essentially independent.
3021 *
3022 * This doesn't matter on 64-bit hashes since they all get merged together in
3023 * the end, so we skip the extra step.
3024 *
3025 * Both XXH3_64bits and XXH3_128bits use this subroutine.
3026 */
3027
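/*
 * As a reference for the SIMD code paths below, one accumulation lane in
 * scalar form is roughly (this mirrors XXH3_accumulate_512_scalar further down):
 *
 *     data_val   = XXH_readLE64(input  + 8*i);
 *     data_key   = data_val ^ XXH_readLE64(secret + 8*i);
 *     acc[i ^ 1] += data_val;                                    // mix in the raw input
 *     acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64 multiply
 */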
3028#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
3029
3030#ifndef XXH_TARGET_AVX512
3031# define XXH_TARGET_AVX512 /* disable attribute target */
3032#endif
3033
3034XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3035XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3036 const void* XXH_RESTRICT input,
3037 const void* XXH_RESTRICT secret)
3038{
3039 XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
3040 XXH_ASSERT((((size_t)acc) & 63) == 0);
3041 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3042
3043 {
3044 /* data_vec = input[0]; */
3045 __m512i const data_vec = _mm512_loadu_si512 (input);
3046 /* key_vec = secret[0]; */
3047 __m512i const key_vec = _mm512_loadu_si512 (secret);
3048 /* data_key = data_vec ^ key_vec; */
3049 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
3050 /* data_key_lo = data_key >> 32; */
3051 __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3052 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3053 __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
3054 /* xacc[0] += swap(data_vec); */
3055 __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
3056 __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
3057 /* xacc[0] += product; */
3058 *xacc = _mm512_add_epi64(product, sum);
3059 }
3060}
3061
3062/*
3063 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
3064 *
3065 * Multiplication isn't perfect, as explained by Google in HighwayHash:
3066 *
3067 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
3068 * // varying degrees. In descending order of goodness, bytes
3069 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
3070 * // As expected, the upper and lower bytes are much worse.
3071 *
3072 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
3073 *
3074 * Since our algorithm uses a pseudorandom secret to add some variance into the
3075 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
3076 *
3077 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
3078 * extraction.
3079 *
3080 * Both XXH3_64bits and XXH3_128bits use this subroutine.
3081 */
3082
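/*
 * As a reference for the SIMD code paths below, one scramble lane in scalar
 * form is roughly (this mirrors XXH3_scrambleAcc_scalar further down; the SIMD
 * variants split the multiply into two 32-bit halves to get the same result):
 *
 *     acc64  = acc[i];
 *     acc64 ^= acc64 >> 47;                 // xorshift
 *     acc64 ^= XXH_readLE64(secret + 8*i);  // mix in the secret
 *     acc64 *= XXH_PRIME32_1;               // 64-bit multiply by a 32-bit prime
 *     acc[i] = acc64;
 */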
3083XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3084XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3085{
3086 XXH_ASSERT((((size_t)acc) & 63) == 0);
3087 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3088 { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
3089 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3090
3091 /* xacc[0] ^= (xacc[0] >> 47) */
3092 __m512i const acc_vec = *xacc;
3093 __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
3094 __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
3095 /* xacc[0] ^= secret; */
3096 __m512i const key_vec = _mm512_loadu_si512 (secret);
3097 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
3098
3099 /* xacc[0] *= XXH_PRIME32_1; */
3100 __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3101 __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
3102 __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
3103 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
3104 }
3105}
3106
3107XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3108XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3109{
3110 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
3111 XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
3112 XXH_ASSERT(((size_t)customSecret & 63) == 0);
3113 (void)(&XXH_writeLE64);
3114 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
3115 __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
3116
3117 XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret;
3118 XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret;
3119 int i;
3120 for (i=0; i < nbRounds; ++i) {
3121 /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not 'void const*',
3122 * which triggers a "discards ‘const’ qualifier" warning. */
3123 union {
3124 XXH_ALIGN(64) const __m512i* cp;
3125 XXH_ALIGN(64) void* p;
3126 } remote_const_void;
3127 remote_const_void.cp = src + i;
3128 dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
3129 } }
3130}
3131
3132#endif
3133
3134#if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
3135
3136#ifndef XXH_TARGET_AVX2
3137# define XXH_TARGET_AVX2 /* disable attribute target */
3138#endif
3139
3140XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3141XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3142 const void* XXH_RESTRICT input,
3143 const void* XXH_RESTRICT secret)
3144{
3145 XXH_ASSERT((((size_t)acc) & 31) == 0);
3146 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
3147 /* Unaligned. This is mainly for pointer arithmetic, and because
3148 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3149 const __m256i* const xinput = (const __m256i *) input;
3150 /* Unaligned. This is mainly for pointer arithmetic, and because
3151 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3152 const __m256i* const xsecret = (const __m256i *) secret;
3153
3154 size_t i;
3155 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
3156 /* data_vec = xinput[i]; */
3157 __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
3158 /* key_vec = xsecret[i]; */
3159 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
3160 /* data_key = data_vec ^ key_vec; */
3161 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3162 /* data_key_lo = data_key >> 32; */
3163 __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3164 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3165 __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
3166 /* xacc[i] += swap(data_vec); */
3167 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
3168 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
3169 /* xacc[i] += product; */
3170 xacc[i] = _mm256_add_epi64(product, sum);
3171 } }
3172}
3173
3174XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3175XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3176{
3177 XXH_ASSERT((((size_t)acc) & 31) == 0);
3178 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
3179 /* Unaligned. This is mainly for pointer arithmetic, and because
3180 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3181 const __m256i* const xsecret = (const __m256i *) secret;
3182 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
3183
3184 size_t i;
3185 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
3186 /* xacc[i] ^= (xacc[i] >> 47) */
3187 __m256i const acc_vec = xacc[i];
3188 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
3189 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
3190 /* xacc[i] ^= xsecret; */
3191 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
3192 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3193
3194 /* xacc[i] *= XXH_PRIME32_1; */
3195 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3196 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
3197 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
3198 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
3199 }
3200 }
3201}
3202
3203XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3204{
3205 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
3206 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
3207 XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
3208 (void)(&XXH_writeLE64);
3209 XXH_PREFETCH(customSecret);
3210 { __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);
3211
3212 XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret;
3213 XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret;
3214
3215# if defined(__GNUC__) || defined(__clang__)
3216 /*
3217 * On GCC & Clang, marking 'dest' as modified causes the compiler to:
3218 * - not extract the secret from SSE registers in the internal loop
3219 * - use fewer registers, and avoid pushing these registers onto the stack
3220 * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
3221 * customSecret, and on aarch64, this prevented LDP from merging two
3222 * loads together for free. Putting the loads together before the stores
3223 * properly generates LDP.
3224 */
3225 __asm__("" : "+r" (dest));
3226# endif
3227
3228 /* GCC at -O2 needs the loop unrolled manually */
3229 dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
3230 dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
3231 dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
3232 dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
3233 dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
3234 dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
3235 }
3236}
3237
3238#endif
3239
3240#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
3241
3242#ifndef XXH_TARGET_SSE2
3243# define XXH_TARGET_SSE2 /* disable attribute target */
3244#endif
3245
3246XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3247XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3248 const void* XXH_RESTRICT input,
3249 const void* XXH_RESTRICT secret)
3250{
3251 /* SSE2 is just a half-scale version of the AVX2 version. */
3252 XXH_ASSERT((((size_t)acc) & 15) == 0);
3253 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
3254 /* Unaligned. This is mainly for pointer arithmetic, and because
3255 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3256 const __m128i* const xinput = (const __m128i *) input;
3257 /* Unaligned. This is mainly for pointer arithmetic, and because
3258 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3259 const __m128i* const xsecret = (const __m128i *) secret;
3260
3261 size_t i;
3262 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
3263 /* data_vec = xinput[i]; */
3264 __m128i const data_vec = _mm_loadu_si128 (xinput+i);
3265 /* key_vec = xsecret[i]; */
3266 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
3267 /* data_key = data_vec ^ key_vec; */
3268 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
3269 /* data_key_lo = data_key >> 32; */
3270 __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3271 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3272 __m128i const product = _mm_mul_epu32 (data_key, data_key_lo);
3273 /* xacc[i] += swap(data_vec); */
3274 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
3275 __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
3276 /* xacc[i] += product; */
3277 xacc[i] = _mm_add_epi64(product, sum);
3278 } }
3279}
3280
3281XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3282XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3283{
3284 XXH_ASSERT((((size_t)acc) & 15) == 0);
3285 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
3286 /* Unaligned. This is mainly for pointer arithmetic, and because
3287 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3288 const __m128i* const xsecret = (const __m128i *) secret;
3289 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
3290
3291 size_t i;
3292 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
3293 /* xacc[i] ^= (xacc[i] >> 47) */
3294 __m128i const acc_vec = xacc[i];
3295 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
3296 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
3297 /* xacc[i] ^= xsecret[i]; */
3298 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
3299 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
3300
3301 /* xacc[i] *= XXH_PRIME32_1; */
3302 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3303 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
3304 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
3305 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
3306 }
3307 }
3308}
3309
3310XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3311{
3312 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
3313 (void)(&XXH_writeLE64);
3314 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
3315
3316# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
3317 // MSVC 32bit mode does not support _mm_set_epi64x before 2015
3318 XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 };
3319 __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
3320# else
3321 __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
3322# endif
3323 int i;
3324
3325 XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret;
3326 XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret;
3327# if defined(__GNUC__) || defined(__clang__)
3328 /*
3329 * On GCC & Clang, marking 'dest' as modified causes the compiler to:
3330 * - not extract the secret from SSE registers in the internal loop
3331 * - use fewer registers, and avoid pushing these registers onto the stack
3332 */
3333 __asm__("" : "+r" (dest));
3334# endif
3335
3336 for (i=0; i < nbRounds; ++i) {
3337 dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
3338 } }
3339}
3340
3341#endif
3342
3343#if (XXH_VECTOR == XXH_NEON)
3344
3345XXH_FORCE_INLINE void
3346XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
3347 const void* XXH_RESTRICT input,
3348 const void* XXH_RESTRICT secret)
3349{
3350 XXH_ASSERT((((size_t)acc) & 15) == 0);
3351 {
3352 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
3353 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
3354 uint8_t const* const xinput = (const uint8_t *) input;
3355 uint8_t const* const xsecret = (const uint8_t *) secret;
3356
3357 size_t i;
3358 for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
3359 /* data_vec = xinput[i]; */
3360 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
3361 /* key_vec = xsecret[i]; */
3362 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3363 uint64x2_t data_key;
3364 uint32x2_t data_key_lo, data_key_hi;
3365 /* xacc[i] += swap(data_vec); */
3366 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
3367 uint64x2_t const swapped = vextq_u64(data64, data64, 1);
3368 xacc[i] = vaddq_u64 (xacc[i], swapped);
3369 /* data_key = data_vec ^ key_vec; */
3370 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
3371 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
3372 * data_key_hi = (uint32x2_t) (data_key >> 32);
3373 * data_key = UNDEFINED; */
3374 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3375 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
3376 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
3377
3378 }
3379 }
3380}
3381
3382XXH_FORCE_INLINE void
3383XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3384{
3385 XXH_ASSERT((((size_t)acc) & 15) == 0);
3386
3387 { uint64x2_t* xacc = (uint64x2_t*) acc;
3388 uint8_t const* xsecret = (uint8_t const*) secret;
3389 uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
3390
3391 size_t i;
3392 for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
3393 /* xacc[i] ^= (xacc[i] >> 47); */
3394 uint64x2_t acc_vec = xacc[i];
3395 uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
3396 uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
3397
3398 /* xacc[i] ^= xsecret[i]; */
3399 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3400 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
3401
3402 /* xacc[i] *= XXH_PRIME32_1 */
3403 uint32x2_t data_key_lo, data_key_hi;
3404 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
3405 * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
3406 * xacc[i] = UNDEFINED; */
3407 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3408 { /*
3409 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
3410 *
3411 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
3412 * incorrectly "optimize" this:
3413 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
3414 * shifted = vshll_n_u32(tmp, 32);
3415 * to this:
3416 * tmp = "vmulq_u64"(a, b); // no such thing!
3417 * shifted = vshlq_n_u64(tmp, 32);
3418 *
3419 * However, unlike SSE, Clang lacks a 64-bit multiply routine
3420 * for NEON, and it scalarizes two 64-bit multiplies instead.
3421 *
3422 * vmull_u32 has the same timing as vmul_u32, and it avoids
3423 * this bug completely.
3424 * See https://bugs.llvm.org/show_bug.cgi?id=39967
3425 */
3426 uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
3427 /* xacc[i] = prod_hi << 32; */
3428 xacc[i] = vshlq_n_u64(prod_hi, 32);
3429 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
3430 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
3431 }
3432 } }
3433}
3434
3435#endif
3436
3437#if (XXH_VECTOR == XXH_VSX)
3438
3439XXH_FORCE_INLINE void
3440XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
3441 const void* XXH_RESTRICT input,
3442 const void* XXH_RESTRICT secret)
3443{
3444 xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
3445 xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
3446 xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
3447 xxh_u64x2 const v32 = { 32, 32 };
3448 size_t i;
3449 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
3450 /* data_vec = xinput[i]; */
3451 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
3452 /* key_vec = xsecret[i]; */
3453 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
3454 xxh_u64x2 const data_key = data_vec ^ key_vec;
3455 /* shuffled = (data_key << 32) | (data_key >> 32); */
3456 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
3457 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
3458 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
3459 xacc[i] += product;
3460
3461 /* swap high and low halves */
3462#ifdef __s390x__
3463 xacc[i] += vec_permi(data_vec, data_vec, 2);
3464#else
3465 xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
3466#endif
3467 }
3468}
3469
3470XXH_FORCE_INLINE void
3471XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3472{
3473 XXH_ASSERT((((size_t)acc) & 15) == 0);
3474
3475 { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
3476 const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
3477 /* constants */
3478 xxh_u64x2 const v32 = { 32, 32 };
3479 xxh_u64x2 const v47 = { 47, 47 };
3480 xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
3481 size_t i;
3482 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
3483 /* xacc[i] ^= (xacc[i] >> 47); */
3484 xxh_u64x2 const acc_vec = xacc[i];
3485 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
3486
3487 /* xacc[i] ^= xsecret[i]; */
3488 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
3489 xxh_u64x2 const data_key = data_vec ^ key_vec;
3490
3491 /* xacc[i] *= XXH_PRIME32_1 */
3492 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
3493 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
3494 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
3495 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
3496 xacc[i] = prod_odd + (prod_even << v32);
3497 } }
3498}
3499
3500#endif
3501
3502/* scalar variants - universal */
3503
3504XXH_FORCE_INLINE void
3505XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
3506 const void* XXH_RESTRICT input,
3507 const void* XXH_RESTRICT secret)
3508{
3509 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
3510 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
3511 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
3512 size_t i;
3513 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
3514 for (i=0; i < XXH_ACC_NB; i++) {
3515 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
3516 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
3517 xacc[i ^ 1] += data_val; /* swap adjacent lanes */
3518 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
3519 }
3520}
3521
3522XXH_FORCE_INLINE void
3523XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3524{
3525 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
3526 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
3527 size_t i;
3528 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
3529 for (i=0; i < XXH_ACC_NB; i++) {
3530 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
3531 xxh_u64 acc64 = xacc[i];
3532 acc64 = XXH_xorshift64(acc64, 47);
3533 acc64 ^= key64;
3534 acc64 *= XXH_PRIME32_1;
3535 xacc[i] = acc64;
3536 }
3537}
3538
3539XXH_FORCE_INLINE void
3540XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3541{
3542 /*
3543 * We need a separate pointer for the hack below,
3544 * which requires a non-const pointer.
3545 * Any decent compiler will optimize this out otherwise.
3546 */
3547 const xxh_u8* kSecretPtr = XXH3_kSecret;
3548 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
3549
3550#if defined(__clang__) && defined(__aarch64__)
3551 /*
3552 * UGLY HACK:
3553 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
3554 * placed sequentially, in order, at the top of the unrolled loop.
3555 *
3556 * While MOVK is great for generating constants (2 cycles for a 64-bit
3557 * constant compared to 4 cycles for LDR), long MOVK chains stall the
3558 * integer pipelines:
3559 * I L S
3560 * MOVK
3561 * MOVK
3562 * MOVK
3563 * MOVK
3564 * ADD
3565 * SUB STR
3566 * STR
3567 * By forcing loads from memory (as the asm line causes Clang to assume
3568 * that XXH3_kSecretPtr has been changed), the pipelines are used more
3569 * efficiently:
3570 * I L S
3571 * LDR
3572 * ADD LDR
3573 * SUB STR
3574 * STR
3575 * XXH3_64bits_withSeed, len == 256, Snapdragon 835
3576 * without hack: 2654.4 MB/s
3577 * with hack: 3202.9 MB/s
3578 */
3579 __asm__("" : "+r" (kSecretPtr));
3580#endif
3581 /*
3582 * Note: in debug mode, this overrides the asm optimization
3583 * and Clang will emit MOVK chains again.
3584 */
3585 XXH_ASSERT(kSecretPtr == XXH3_kSecret);
3586
3587 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
3588 int i;
3589 for (i=0; i < nbRounds; i++) {
3590 /*
3591 * The asm hack causes Clang to assume that kSecretPtr aliases with
3592 * customSecret, and on aarch64, this prevented LDP from merging two
3593 * loads together for free. Putting the loads together before the stores
3594 * properly generates LDP.
3595 */
3596 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;
3597 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
3598 XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);
3599 XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
3600 } }
3601}
3602
3603
3604typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
3605typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
3606typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
3607
3608
3609#if (XXH_VECTOR == XXH_AVX512)
3610
3611#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
3612#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
3613#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
3614
3615#elif (XXH_VECTOR == XXH_AVX2)
3616
3617#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
3618#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
3619#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
3620
3621#elif (XXH_VECTOR == XXH_SSE2)
3622
3623#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
3624#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
3625#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
3626
3627#elif (XXH_VECTOR == XXH_NEON)
3628
3629#define XXH3_accumulate_512 XXH3_accumulate_512_neon
3630#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
3631#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3632
3633#elif (XXH_VECTOR == XXH_VSX)
3634
3635#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
3636#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
3637#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3638
3639#else /* scalar */
3640
3641#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
3642#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
3643#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3644
3645#endif
3646
3647
3648
3649#ifndef XXH_PREFETCH_DIST
3650# ifdef __clang__
3651# define XXH_PREFETCH_DIST 320
3652# else
3653# if (XXH_VECTOR == XXH_AVX512)
3654# define XXH_PREFETCH_DIST 512
3655# else
3656# define XXH_PREFETCH_DIST 384
3657# endif
3658# endif /* __clang__ */
3659#endif /* XXH_PREFETCH_DIST */
3660
3661/*
3662 * XXH3_accumulate()
3663 * Loops over XXH3_accumulate_512().
3664 * Assumption: nbStripes will not overflow the secret size
3665 */
3666XXH_FORCE_INLINE void
3667XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
3668 const xxh_u8* XXH_RESTRICT input,
3669 const xxh_u8* XXH_RESTRICT secret,
3670 size_t nbStripes,
3671 XXH3_f_accumulate_512 f_acc512)
3672{
3673 size_t n;
3674 for (n = 0; n < nbStripes; n++ ) {
3675 const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
3676 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
3677 f_acc512(acc,
3678 in,
3679 secret + n*XXH_SECRET_CONSUME_RATE);
3680 }
3681}
3682
3683XXH_FORCE_INLINE void
3684XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
3685 const xxh_u8* XXH_RESTRICT input, size_t len,
3686 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
3687 XXH3_f_accumulate_512 f_acc512,
3688 XXH3_f_scrambleAcc f_scramble)
3689{
3690 size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
3691 size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
3692 size_t const nb_blocks = (len - 1) / block_len;
3693
3694 size_t n;
3695
3696 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3697
3698 for (n = 0; n < nb_blocks; n++) {
3699 XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
3700 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
3701 }
3702
3703 /* last partial block */
3704 XXH_ASSERT(len > XXH_STRIPE_LEN);
3705 { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
3706 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
3707 XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
3708
3709 /* last stripe */
3710 { const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
3711#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
3712 f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
3713 } }
3714}
3715
3716XXH_FORCE_INLINE xxh_u64
3717XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
3718{
3719 return XXH3_mul128_fold64(
3720 acc[0] ^ XXH_readLE64(secret),
3721 acc[1] ^ XXH_readLE64(secret+8) );
3722}
3723
3724static XXH64_hash_t
3725XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
3726{
3727 xxh_u64 result64 = start;
3728 size_t i = 0;
3729
3730 for (i = 0; i < 4; i++) {
3731 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
3732#if defined(__clang__) /* Clang */ \
3733 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
3734 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
3735 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
3736 /*
3737 * UGLY HACK:
3738 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
3739 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
3740 * XXH3_64bits, len == 256, Snapdragon 835:
3741 * without hack: 2063.7 MB/s
3742 * with hack: 2560.7 MB/s
3743 */
3744 __asm__("" : "+r" (result64));
3745#endif
3746 }
3747
3748 return XXH3_avalanche(result64);
3749}
3750
3751#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
3752 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
3753
3754XXH_FORCE_INLINE XXH64_hash_t
3755XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
3756 const void* XXH_RESTRICT secret, size_t secretSize,
3757 XXH3_f_accumulate_512 f_acc512,
3758 XXH3_f_scrambleAcc f_scramble)
3759{
3760 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
3761
3762 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
3763
3764 /* converge into final hash */
3765 XXH_STATIC_ASSERT(sizeof(acc) == 64);
3766 /* do not align on 8, so that the secret is different from the accumulator */
3767#define XXH_SECRET_MERGEACCS_START 11
3768 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
3769 return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
3770}
3771
3772/*
3773 * It's important for performance that XXH3_hashLong is not inlined.
3774 */
3775XXH_NO_INLINE XXH64_hash_t
3776XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
3777 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
3778{
3779 (void)seed64;
3780 return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
3781}
3782
3783/*
3784 * It's important for performance that XXH3_hashLong is not inlined.
3785 * Since the function is not inlined, the compiler may not be able to understand that,
3786 * in some scenarios, its `secret` argument is actually a compile time constant.
3787 * This variant enforces that the compiler can detect that,
3788 * and uses this opportunity to streamline the generated code for better performance.
3789 */
3790XXH_NO_INLINE XXH64_hash_t
3791XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
3792 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
3793{
3794 (void)seed64; (void)secret; (void)secretLen;
3795 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
3796}
3797
3798/*
3799 * XXH3_hashLong_64b_withSeed():
3800 * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
3801 * and then use this key for long mode hashing.
3802 *
3803 * This operation is decently fast but nonetheless costs a little bit of time.
3804 * Try to avoid it whenever possible (typically when seed==0).
3805 *
3806 * It's important for performance that XXH3_hashLong is not inlined. Not sure
3807 * why (uop cache maybe?), but the difference is large and easily measurable.
3808 */
3809XXH_FORCE_INLINE XXH64_hash_t
3810XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
3811 XXH64_hash_t seed,
3812 XXH3_f_accumulate_512 f_acc512,
3813 XXH3_f_scrambleAcc f_scramble,
3814 XXH3_f_initCustomSecret f_initSec)
3815{
3816 if (seed == 0)
3817 return XXH3_hashLong_64b_internal(input, len,
3818 XXH3_kSecret, sizeof(XXH3_kSecret),
3819 f_acc512, f_scramble);
3820 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
3821 f_initSec(secret, seed);
3822 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
3823 f_acc512, f_scramble);
3824 }
3825}
3826
3827/*
3828 * It's important for performance that XXH3_hashLong is not inlined.
3829 */
3830XXH_NO_INLINE XXH64_hash_t
3831XXH3_hashLong_64b_withSeed(const void* input, size_t len,
3832 XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
3833{
3834 (void)secret; (void)secretLen;
3835 return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
3836 XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
3837}
3838
3839
3840typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
3841 XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
3842
3843XXH_FORCE_INLINE XXH64_hash_t
3844XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
3845 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
3846 XXH3_hashLong64_f f_hashLong)
3847{
3848 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
3849 /*
3850 * If an action is to be taken if `secretLen` condition is not respected,
3851 * it should be done here.
3852 * For now, it's a contract pre-condition.
3853 * Adding a check and a branch here would cost performance at every hash.
3854 * Also, note that function signature doesn't offer room to return an error.
3855 */
3856 if (len <= 16)
3857 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
3858 if (len <= 128)
3859 return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
3860 if (len <= XXH3_MIDSIZE_MAX)
3861 return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
3862 return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
3863}
3864
3865
3866/* === Public entry point === */
3867
3868XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
3869{
3870 return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
3871}
3872
3873XXH_PUBLIC_API XXH64_hash_t
3874XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
3875{
3876 return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
3877}
3878
3879XXH_PUBLIC_API XXH64_hash_t
3880XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
3881{
3882 return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
3883}
3884
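/*
 * Illustrative one-shot usage (a minimal sketch; `data`, `size`, `seed`,
 * `secret` and `secretSize` are caller-provided, and `secretSize` must be
 * at least XXH3_SECRET_SIZE_MIN):
 *
 *     XXH64_hash_t h1 = XXH3_64bits(data, size);
 *     XXH64_hash_t h2 = XXH3_64bits_withSeed(data, size, seed);
 *     XXH64_hash_t h3 = XXH3_64bits_withSecret(data, size, secret, secretSize);
 */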
3885
3886/* === XXH3 streaming === */
3887
3888/*
3889 * Malloc's a pointer that is always aligned to align.
3890 *
3891 * This must be freed with `XXH_alignedFree()`.
3892 *
3893 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
3894 * alignment on 32-bit. This isn't enough for the 32-byte aligned loads of AVX2,
3895 * or, on 32-bit, for the 16-byte aligned loads of SSE2 and NEON.
3896 *
3897 * This underalignment previously caused a rather obvious crash which went
3898 * completely unnoticed due to XXH3_createState() not actually being tested.
3899 * Credit to RedSpah for noticing this bug.
3900 *
3901 * The alignment is done manually: functions like posix_memalign or _mm_malloc
3902 * are avoided because, to maintain portability, we would have to write a
3903 * fallback like this anyway, and besides, testing for the existence of
3904 * library functions without relying on external build tools is impossible.
3905 *
3906 * The method is simple: Overallocate, manually align, and store the offset
3907 * to the original behind the returned pointer.
3908 *
3909 * Align must be a power of 2 and 8 <= align <= 128.
3910 */
3911static void* XXH_alignedMalloc(size_t s, size_t align)
3912{
3913 XXH_ASSERT(align <= 128 && align >= 8); /* range check */
3914 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */
3915 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */
3916 { /* Overallocate to make room for manual realignment and an offset byte */
3917 xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
3918 if (base != NULL) {
3919 /*
3920 * Get the offset needed to align this pointer.
3921 *
3922 * Even if `base` is already aligned, the offset is never zero, so there is
3923 * always at least one byte before the returned pointer to store the offset.
3924 */
3925 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
3926 /* Add the offset for the now-aligned pointer */
3927 xxh_u8* ptr = base + offset;
3928
3929 XXH_ASSERT((size_t)ptr % align == 0);
3930
3931 /* Store the offset immediately before the returned pointer. */
3932 ptr[-1] = (xxh_u8)offset;
3933 return ptr;
3934 }
3935 return NULL;
3936 }
3937}
3938/*
3939 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
3940 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
3941 */
3942static void XXH_alignedFree(void* p)
3943{
3944 if (p != NULL) {
3945 xxh_u8* ptr = (xxh_u8*)p;
3946 /* Get the offset byte we stored in XXH_alignedMalloc(). */
3947 xxh_u8 offset = ptr[-1];
3948 /* Free the original malloc'd pointer */
3949 xxh_u8* base = ptr - offset;
3950 XXH_free(base);
3951 }
3952}
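/*
 * Worked example of the layout above (illustrative values): if XXH_malloc()
 * returns base = 0x1003 and align = 16, then offset = 16 - (0x1003 & 15) = 13,
 * the returned pointer is 0x1010, and ptr[-1] stores 13. XXH_alignedFree()
 * recovers base as 0x1010 - 13 = 0x1003 and frees it.
 */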
3953XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
3954{
3955 XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
3956 if (state==NULL) return NULL;
3957 XXH3_INITSTATE(state);
3958 return state;
3959}
3960
3961XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
3962{
3963 XXH_alignedFree(statePtr);
3964 return XXH_OK;
3965}
3966
3967XXH_PUBLIC_API void
3968XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
3969{
3970 memcpy(dst_state, src_state, sizeof(*dst_state));
3971}
3972
3973static void
3974XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
3975 XXH64_hash_t seed,
3976 const void* secret, size_t secretSize)
3977{
3978 size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
3979 size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
3980 XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
3981 XXH_ASSERT(statePtr != NULL);
3982 /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
3983 memset((char*)statePtr + initStart, 0, initLength);
3984 statePtr->acc[0] = XXH_PRIME32_3;
3985 statePtr->acc[1] = XXH_PRIME64_1;
3986 statePtr->acc[2] = XXH_PRIME64_2;
3987 statePtr->acc[3] = XXH_PRIME64_3;
3988 statePtr->acc[4] = XXH_PRIME64_4;
3989 statePtr->acc[5] = XXH_PRIME32_2;
3990 statePtr->acc[6] = XXH_PRIME64_5;
3991 statePtr->acc[7] = XXH_PRIME32_1;
3992 statePtr->seed = seed;
3993 statePtr->extSecret = (const unsigned char*)secret;
3994 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3995 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
3996 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
3997}
3998
3999XXH_PUBLIC_API XXH_errorcode
4000XXH3_64bits_reset(XXH3_state_t* statePtr)
4001{
4002 if (statePtr == NULL) return XXH_ERROR;
4003 XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4004 return XXH_OK;
4005}
4006
4007XXH_PUBLIC_API XXH_errorcode
4008XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
4009{
4010 if (statePtr == NULL) return XXH_ERROR;
4011 if (secret == NULL) return XXH_ERROR;
4012 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4013 XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
4014 return XXH_OK;
4015}
4016
4017XXH_PUBLIC_API XXH_errorcode
4018XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4019{
4020 if (statePtr == NULL) return XXH_ERROR;
4021 if (seed==0) return XXH3_64bits_reset(statePtr);
4022 if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4023 XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4024 return XXH_OK;
4025}
4026
4027/* Note: when XXH3_consumeStripes() is invoked,
4028 * there must be a guarantee that at least one more byte will be consumed from input,
4029 * so that the function can blindly consume all stripes using the "normal" secret segment */
4030XXH_FORCE_INLINE void
4031XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4032 size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
4033 const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
4034 const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
4035 XXH3_f_accumulate_512 f_acc512,
4036 XXH3_f_scrambleAcc f_scramble)
4037{
4038 XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
4039 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
4040 if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
4041 /* need a scrambling operation */
4042 size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
4043 size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
4044 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
4045 f_scramble(acc, secret + secretLimit);
4046 XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
4047 *nbStripesSoFarPtr = nbStripesAfterBlock;
4048 } else {
4049 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
4050 *nbStripesSoFarPtr += nbStripes;
4051 }
4052}
4053
4054/*
4055 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4056 */
4057XXH_FORCE_INLINE XXH_errorcode
4058XXH3_update(XXH3_state_t* state,
4059 const xxh_u8* input, size_t len,
4060 XXH3_f_accumulate_512 f_acc512,
4061 XXH3_f_scrambleAcc f_scramble)
4062{
4063 if (input==NULL)
4064#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
4065 return XXH_OK;
4066#else
4067 return XXH_ERROR;
4068#endif
4069
4070 { const xxh_u8* const bEnd = input + len;
4071 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4072
4073 state->totalLen += len;
4074
4075 if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
4076 XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4077 state->bufferedSize += (XXH32_hash_t)len;
4078 return XXH_OK;
4079 }
4080 /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4081
4082 #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4083 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
4084
4085 /*
4086 * Internal buffer is partially filled (always, except at the beginning).
4087 * Complete it, then consume it.
4088 */
4089 if (state->bufferedSize) {
4090 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4091 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4092 input += loadSize;
4093 XXH3_consumeStripes(state->acc,
4094 &state->nbStripesSoFar, state->nbStripesPerBlock,
4095 state->buffer, XXH3_INTERNALBUFFER_STRIPES,
4096 secret, state->secretLimit,
4097 f_acc512, f_scramble);
4098 state->bufferedSize = 0;
4099 }
4100 XXH_ASSERT(input < bEnd);
4101
4102 /* Consume input by a multiple of internal buffer size */
4103 if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
4104 const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4105 do {
4106 XXH3_consumeStripes(state->acc,
4107 &state->nbStripesSoFar, state->nbStripesPerBlock,
4108 input, XXH3_INTERNALBUFFER_STRIPES,
4109 secret, state->secretLimit,
4110 f_acc512, f_scramble);
4111 input += XXH3_INTERNALBUFFER_SIZE;
4112 } while (input<limit);
4113 /* for last partial stripe */
4114 memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
4115 }
4116 XXH_ASSERT(input < bEnd);
4117
4118 /* Some remaining input (always) : buffer it */
4119 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
4120 state->bufferedSize = (XXH32_hash_t)(bEnd-input);
4121 }
4122
4123 return XXH_OK;
4124}
4125
4126XXH_PUBLIC_API XXH_errorcode
4127XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
4128{
4129 return XXH3_update(state, (const xxh_u8*)input, len,
4130 XXH3_accumulate_512, XXH3_scrambleAcc);
4131}
4132
4133
4134XXH_FORCE_INLINE void
4135XXH3_digest_long (XXH64_hash_t* acc,
4136 const XXH3_state_t* state,
4137 const unsigned char* secret)
4138{
4139 /*
4140 * Digest on a local copy. This way, the state remains unaltered, and it can
4141 * continue ingesting more input afterwards.
4142 */
4143 memcpy(acc, state->acc, sizeof(state->acc));
4144 if (state->bufferedSize >= XXH_STRIPE_LEN) {
4145 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4146 size_t nbStripesSoFar = state->nbStripesSoFar;
4147 XXH3_consumeStripes(acc,
4148 &nbStripesSoFar, state->nbStripesPerBlock,
4149 state->buffer, nbStripes,
4150 secret, state->secretLimit,
4151 XXH3_accumulate_512, XXH3_scrambleAcc);
4152 /* last stripe */
4153 XXH3_accumulate_512(acc,
4154 state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
4155 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4156 } else { /* bufferedSize < XXH_STRIPE_LEN */
4157 xxh_u8 lastStripe[XXH_STRIPE_LEN];
4158 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4159 XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
4160 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
4161 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4162 XXH3_accumulate_512(acc,
4163 lastStripe,
4164 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4165 }
4166}
4167
4168XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
4169{
4170 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4171 if (state->totalLen > XXH3_MIDSIZE_MAX) {
4172 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
4173 XXH3_digest_long(acc, state, secret);
4174 return XXH3_mergeAccs(acc,
4175 secret + XXH_SECRET_MERGEACCS_START,
4176 (xxh_u64)state->totalLen * XXH_PRIME64_1);
4177 }
4178 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4179 if (state->seed)
4180 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4181 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4182 secret, state->secretLimit + XXH_STRIPE_LEN);
4183}
4184
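/*
 * Illustrative streaming usage (a minimal sketch; `chunk`, `chunkSize`, `seed`,
 * `hash` and `next_chunk()` are hypothetical caller-provided names, and error
 * checks are omitted):
 *
 *     XXH3_state_t* const state = XXH3_createState();
 *     XXH3_64bits_reset_withSeed(state, seed);
 *     while ((chunkSize = next_chunk(chunk, sizeof(chunk))) != 0) {
 *         XXH3_64bits_update(state, chunk, chunkSize);
 *     }
 *     hash = XXH3_64bits_digest(state);   // the state can keep ingesting afterwards
 *     XXH3_freeState(state);
 */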
4185
4186#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4187
4188XXH_PUBLIC_API void
4189XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
4190{
4191 XXH_ASSERT(secretBuffer != NULL);
4192 if (customSeedSize == 0) {
4193 memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4194 return;
4195 }
4196 XXH_ASSERT(customSeed != NULL);
4197
4198 { size_t const segmentSize = sizeof(XXH128_hash_t);
4199 size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4200 XXH128_canonical_t scrambler;
4201 XXH64_hash_t seeds[12];
4202 size_t segnb;
4203 XXH_ASSERT(nbSegments == 12);
4204 XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
4205 XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4206
4207 /*
4208 * Copy customSeed to seeds[], truncating or repeating as necessary.
4209 */
4210 { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4211 size_t filled = toFill;
4212 memcpy(seeds, customSeed, toFill);
4213 while (filled < sizeof(seeds)) {
4214 toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4215 memcpy((char*)seeds + filled, seeds, toFill);
4216 filled += toFill;
4217 } }
4218
4219 /* generate secret */
4220 memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4221 for (segnb=1; segnb < nbSegments; segnb++) {
4222 size_t const segmentStart = segnb * segmentSize;
4223 XXH128_canonical_t segment;
4224 XXH128_canonicalFromHash(&segment,
4225 XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
4226 memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
4227 } }
4228}
4229
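/*
 * Illustrative usage (a minimal sketch; `seedMaterial`, `seedSize`, `input`
 * and `len` are caller-provided):
 *
 *     xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
 *     XXH3_generateSecret(secret, seedMaterial, seedSize);
 *     XXH64_hash_t const h = XXH3_64bits_withSecret(input, len, secret, sizeof(secret));
 */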
4230
4231/* ==========================================
4232 * XXH3 128 bits (a.k.a XXH128)
4233 * ==========================================
4234 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
4235 * even without counting the significantly larger output size.
4236 *
4237 * For example, extra steps are taken to avoid the seed-dependent collisions
4238 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
4239 *
4240 * This strength naturally comes at the cost of some speed, especially on short
4241 * lengths. Note that longer hashes are about as fast as the 64-bit version
4242 * due to it using only a slight modification of the 64-bit loop.
4243 *
4244 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
4245 * fast for a _128-bit_ hash on 32-bit (it usually outperforms XXH64).
4246 */
4247
4248XXH_FORCE_INLINE XXH128_hash_t
4249XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4250{
4251 /* A doubled version of 1to3_64b with different constants. */
4252 XXH_ASSERT(input != NULL);
4253 XXH_ASSERT(1 <= len && len <= 3);
4254 XXH_ASSERT(secret != NULL);
4255 /*
4256 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
4257 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
4258 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
4259 */
4260 { xxh_u8 const c1 = input[0];
4261 xxh_u8 const c2 = input[len >> 1];
4262 xxh_u8 const c3 = input[len - 1];
4263 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
4264 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
4265 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
4266 xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
4267 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
4268 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
4269 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
4270 XXH128_hash_t h128;
4271 h128.low64 = XXH64_avalanche(keyed_lo);
4272 h128.high64 = XXH64_avalanche(keyed_hi);
4273 return h128;
4274 }
4275}
4276
4277XXH_FORCE_INLINE XXH128_hash_t
4278XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4279{
4280 XXH_ASSERT(input != NULL);
4281 XXH_ASSERT(secret != NULL);
4282 XXH_ASSERT(4 <= len && len <= 8);
4283 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
4284 { xxh_u32 const input_lo = XXH_readLE32(input);
4285 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
4286 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
4287 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
4288 xxh_u64 const keyed = input_64 ^ bitflip;
4289
4290 /* Shift len to the left so that the multiplier XXH_PRIME64_1 + (len << 2) remains odd; this avoids an even multiplier. */
4291 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
4292
4293 m128.high64 += (m128.low64 << 1);
4294 m128.low64 ^= (m128.high64 >> 3);
4295
4296 m128.low64 = XXH_xorshift64(m128.low64, 35);
4297 m128.low64 *= 0x9FB21C651E98DF25ULL;
4298 m128.low64 = XXH_xorshift64(m128.low64, 28);
4299 m128.high64 = XXH3_avalanche(m128.high64);
4300 return m128;
4301 }
4302}
4303
4304XXH_FORCE_INLINE XXH128_hash_t
4305XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4306{
4307 XXH_ASSERT(input != NULL);
4308 XXH_ASSERT(secret != NULL);
4309 XXH_ASSERT(9 <= len && len <= 16);
4310 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
4311 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
4312 xxh_u64 const input_lo = XXH_readLE64(input);
4313 xxh_u64 input_hi = XXH_readLE64(input + len - 8);
4314 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
4315 /*
4316 * Put len in the middle of m128 to ensure that the length gets mixed to
4317 * both the low and high bits in the 128x64 multiply below.
4318 */
4319 m128.low64 += (xxh_u64)(len - 1) << 54;
4320 input_hi ^= bitfliph;
4321 /*
4322 * Add the high 32 bits of input_hi to the high 32 bits of m128, then
4323 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
4324 * the high 64 bits of m128.
4325 *
4326 * The best approach to this operation is different on 32-bit and 64-bit.
4327 */
4328 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
4329 /*
4330 * 32-bit optimized version, which is more readable.
4331 *
4332 * On 32-bit, it removes an ADC and delays a dependency between the two
4333 * halves of m128.high64, but it generates an extra mask on 64-bit.
4334 */
4335 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
4336 } else {
4337 /*
4338 * 64-bit optimized (albeit more confusing) version.
4339 *
4340 * Uses some properties of addition and multiplication to remove the mask:
4341 *
4342 * Let:
4343 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
4344 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
4345 * c = XXH_PRIME32_2
4346 *
4347 * a + (b * c)
4348 * Inverse Property: x + y - x == y
4349 * a + (b * (1 + c - 1))
4350 * Distributive Property: x * (y + z) == (x * y) + (x * z)
4351 * a + (b * 1) + (b * (c - 1))
4352 * Identity Property: x * 1 == x
4353 * a + b + (b * (c - 1))
4354 *
4355 * Substitute a, b, and c:
4356 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
4357 *
4358 * Since input_hi.hi + input_hi.lo == input_hi, we get this:
4359 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
4360 */
4361 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
4362 }
4363 /* m128 ^= XXH_swap64(m128 >> 64); */
4364 m128.low64 ^= XXH_swap64(m128.high64);
4365
4366 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
4367 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
4368 h128.high64 += m128.high64 * XXH_PRIME64_2;
4369
4370 h128.low64 = XXH3_avalanche(h128.low64);
4371 h128.high64 = XXH3_avalanche(h128.high64);
4372 return h128;
4373 } }
4374}
4375
4376/*
4377 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
4378 */
4379XXH_FORCE_INLINE XXH128_hash_t
4380XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4381{
4382 XXH_ASSERT(len <= 16);
4383 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
4384 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
4385 if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
4386 { XXH128_hash_t h128;
4387 xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
4388 xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
4389 h128.low64 = XXH64_avalanche(seed ^ bitflipl);
4390        h128.high64 = XXH64_avalanche(seed ^ bitfliph);
4391 return h128;
4392 } }
4393}
4394
4395/*
4396 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
4397 */
4398XXH_FORCE_INLINE XXH128_hash_t
4399XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
4400 const xxh_u8* secret, XXH64_hash_t seed)
4401{
4402 acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
4403 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
4404 acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
4405 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
4406 return acc;
4407}
4408
4409
4410XXH_FORCE_INLINE XXH128_hash_t
4411XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4412 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4413 XXH64_hash_t seed)
4414{
4415 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
4416 XXH_ASSERT(16 < len && len <= 128);
4417
4418 { XXH128_hash_t acc;
4419 acc.low64 = len * XXH_PRIME64_1;
4420 acc.high64 = 0;
4421 if (len > 32) {
4422 if (len > 64) {
4423 if (len > 96) {
4424 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
4425 }
4426 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
4427 }
4428 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
4429 }
4430 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
4431 { XXH128_hash_t h128;
4432 h128.low64 = acc.low64 + acc.high64;
4433 h128.high64 = (acc.low64 * XXH_PRIME64_1)
4434 + (acc.high64 * XXH_PRIME64_4)
4435 + ((len - seed) * XXH_PRIME64_2);
4436 h128.low64 = XXH3_avalanche(h128.low64);
4437 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
4438 return h128;
4439 }
4440 }
4441}
4442
4443XXH_NO_INLINE XXH128_hash_t
4444XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4445 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4446 XXH64_hash_t seed)
4447{
4448 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
4449 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
4450
4451 { XXH128_hash_t acc;
4452 int const nbRounds = (int)len / 32;
4453 int i;
4454 acc.low64 = len * XXH_PRIME64_1;
4455 acc.high64 = 0;
4456 for (i=0; i<4; i++) {
4457 acc = XXH128_mix32B(acc,
4458 input + (32 * i),
4459 input + (32 * i) + 16,
4460 secret + (32 * i),
4461 seed);
4462 }
4463 acc.low64 = XXH3_avalanche(acc.low64);
4464 acc.high64 = XXH3_avalanche(acc.high64);
4465 XXH_ASSERT(nbRounds >= 4);
4466 for (i=4 ; i < nbRounds; i++) {
4467 acc = XXH128_mix32B(acc,
4468 input + (32 * i),
4469 input + (32 * i) + 16,
4470 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
4471 seed);
4472 }
4473 /* last bytes */
4474 acc = XXH128_mix32B(acc,
4475 input + len - 16,
4476 input + len - 32,
4477 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
4478 0ULL - seed);
4479
4480 { XXH128_hash_t h128;
4481 h128.low64 = acc.low64 + acc.high64;
4482 h128.high64 = (acc.low64 * XXH_PRIME64_1)
4483 + (acc.high64 * XXH_PRIME64_4)
4484 + ((len - seed) * XXH_PRIME64_2);
4485 h128.low64 = XXH3_avalanche(h128.low64);
4486 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
4487 return h128;
4488 }
4489 }
4490}
4491
4492XXH_FORCE_INLINE XXH128_hash_t
4493XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
4494 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4495 XXH3_f_accumulate_512 f_acc512,
4496 XXH3_f_scrambleAcc f_scramble)
4497{
4498 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4499
4500 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
4501
4502 /* converge into final hash */
4503 XXH_STATIC_ASSERT(sizeof(acc) == 64);
4504 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
4505 { XXH128_hash_t h128;
4506 h128.low64 = XXH3_mergeAccs(acc,
4507 secret + XXH_SECRET_MERGEACCS_START,
4508 (xxh_u64)len * XXH_PRIME64_1);
4509 h128.high64 = XXH3_mergeAccs(acc,
4510 secret + secretSize
4511 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
4512 ~((xxh_u64)len * XXH_PRIME64_2));
4513 return h128;
4514 }
4515}
4516
4517/*
4518 * It's important for performance that XXH3_hashLong is not inlined.
4519 */
4520XXH_NO_INLINE XXH128_hash_t
4521XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
4522 XXH64_hash_t seed64,
4523 const void* XXH_RESTRICT secret, size_t secretLen)
4524{
4525 (void)seed64; (void)secret; (void)secretLen;
4526 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
4527 XXH3_accumulate_512, XXH3_scrambleAcc);
4528}
4529
4530/*
4531 * It's important for performance that XXH3_hashLong is not inlined.
4532 */
4533XXH_NO_INLINE XXH128_hash_t
4534XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
4535 XXH64_hash_t seed64,
4536 const void* XXH_RESTRICT secret, size_t secretLen)
4537{
4538 (void)seed64;
4539 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
4540 XXH3_accumulate_512, XXH3_scrambleAcc);
4541}
4542
4543XXH_FORCE_INLINE XXH128_hash_t
4544XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
4545 XXH64_hash_t seed64,
4546 XXH3_f_accumulate_512 f_acc512,
4547 XXH3_f_scrambleAcc f_scramble,
4548 XXH3_f_initCustomSecret f_initSec)
4549{
4550 if (seed64 == 0)
4551 return XXH3_hashLong_128b_internal(input, len,
4552 XXH3_kSecret, sizeof(XXH3_kSecret),
4553 f_acc512, f_scramble);
4554 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4555 f_initSec(secret, seed64);
4556 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
4557 f_acc512, f_scramble);
4558 }
4559}
4560
4561/*
4562 * It's important for performance that XXH3_hashLong is not inlined.
4563 */
4564XXH_NO_INLINE XXH128_hash_t
4565XXH3_hashLong_128b_withSeed(const void* input, size_t len,
4566 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
4567{
4568 (void)secret; (void)secretLen;
4569 return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
4570 XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
4571}
4572
4573typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
4574 XXH64_hash_t, const void* XXH_RESTRICT, size_t);
4575
4576XXH_FORCE_INLINE XXH128_hash_t
4577XXH3_128bits_internal(const void* input, size_t len,
4578 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
4579 XXH3_hashLong128_f f_hl128)
4580{
4581 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
4582 /*
4583     * If any action needed to be taken when the `secret` conditions are not respected,
4584     * it should be done here.
4585 * For now, it's a contract pre-condition.
4586 * Adding a check and a branch here would cost performance at every hash.
4587 */
4588 if (len <= 16)
4589 return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
4590 if (len <= 128)
4591 return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
4592 if (len <= XXH3_MIDSIZE_MAX)
4593 return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
4594 return f_hl128(input, len, seed64, secret, secretLen);
4595}
4596
4597
4598/* === Public XXH128 API === */
4599
4600XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
4601{
4602 return XXH3_128bits_internal(input, len, 0,
4603 XXH3_kSecret, sizeof(XXH3_kSecret),
4604 XXH3_hashLong_128b_default);
4605}
4606
4607XXH_PUBLIC_API XXH128_hash_t
4608XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
4609{
4610 return XXH3_128bits_internal(input, len, 0,
4611 (const xxh_u8*)secret, secretSize,
4612 XXH3_hashLong_128b_withSecret);
4613}
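
/*
 * Illustrative custom-secret usage (a minimal sketch, not part of the library;
 * myData, myDataSize and mySecret are hypothetical). The secret must provide
 * at least XXH3_SECRET_SIZE_MIN bytes of high-entropy content: as noted in
 * XXH3_128bits_internal() above, this is a contract pre-condition and is not
 * re-checked at runtime on this path.
 *
 *     unsigned char mySecret[XXH3_SECRET_SIZE_MIN];
 *     // ... fill mySecret with unpredictable, well-distributed bytes ...
 *     XXH128_hash_t const h = XXH3_128bits_withSecret(myData, myDataSize,
 *                                                     mySecret, sizeof(mySecret));
 */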
4614
4615XXH_PUBLIC_API XXH128_hash_t
4616XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
4617{
4618 return XXH3_128bits_internal(input, len, seed,
4619 XXH3_kSecret, sizeof(XXH3_kSecret),
4620 XXH3_hashLong_128b_withSeed);
4621}
4622
4623XXH_PUBLIC_API XXH128_hash_t
4624XXH128(const void* input, size_t len, XXH64_hash_t seed)
4625{
4626 return XXH3_128bits_withSeed(input, len, seed);
4627}
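
/*
 * Illustrative one-shot usage (a minimal sketch, not part of the library;
 * myData and myDataSize are hypothetical):
 *
 *     XXH128_hash_t const h = XXH128(myData, myDataSize, 0);
 *     // h.low64 and h.high64 together form the 128-bit result.
 *     // With a 0 seed, this matches XXH3_128bits(myData, myDataSize).
 */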
4628
4629
4630/* === XXH3 128-bit streaming === */
4631
4632/*
4633 * All the functions are actually the same as for the 64-bit streaming variant.
4634 * The only difference is the finalization routine.
4635 */
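
/*
 * Illustrative streaming usage (a minimal sketch, not part of the library;
 * the input chunk variables are hypothetical):
 *
 *     XXH3_state_t* const state = XXH3_createState();
 *     if (state == NULL) return XXH_ERROR;          // allocation failure
 *     XXH3_128bits_reset(state);                    // or XXH3_128bits_reset_withSeed(state, seed)
 *     while (more_input) {
 *         XXH3_128bits_update(state, chunkPtr, chunkSize);
 *     }
 *     {   XXH128_hash_t const h = XXH3_128bits_digest(state);
 *         // ... use h ...
 *     }
 *     XXH3_freeState(state);
 */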
4636
4637static void
4638XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
4639 XXH64_hash_t seed,
4640 const void* secret, size_t secretSize)
4641{
4642 XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
4643}
4644
4645XXH_PUBLIC_API XXH_errorcode
4646XXH3_128bits_reset(XXH3_state_t* statePtr)
4647{
4648 if (statePtr == NULL) return XXH_ERROR;
4649 XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4650 return XXH_OK;
4651}
4652
4653XXH_PUBLIC_API XXH_errorcode
4654XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
4655{
4656 if (statePtr == NULL) return XXH_ERROR;
4657 XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
4658 if (secret == NULL) return XXH_ERROR;
4659 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4660 return XXH_OK;
4661}
4662
4663XXH_PUBLIC_API XXH_errorcode
4664XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4665{
4666 if (statePtr == NULL) return XXH_ERROR;
4667 if (seed==0) return XXH3_128bits_reset(statePtr);
4668 if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4669 XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4670 return XXH_OK;
4671}
4672
4673XXH_PUBLIC_API XXH_errorcode
4674XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
4675{
4676 return XXH3_update(state, (const xxh_u8*)input, len,
4677 XXH3_accumulate_512, XXH3_scrambleAcc);
4678}
4679
4680XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
4681{
4682 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4683 if (state->totalLen > XXH3_MIDSIZE_MAX) {
4684 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
4685 XXH3_digest_long(acc, state, secret);
4686 XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
4687 { XXH128_hash_t h128;
4688 h128.low64 = XXH3_mergeAccs(acc,
4689 secret + XXH_SECRET_MERGEACCS_START,
4690 (xxh_u64)state->totalLen * XXH_PRIME64_1);
4691 h128.high64 = XXH3_mergeAccs(acc,
4692 secret + state->secretLimit + XXH_STRIPE_LEN
4693 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
4694 ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
4695 return h128;
4696 }
4697 }
4698 /* len <= XXH3_MIDSIZE_MAX : short code */
4699 if (state->seed)
4700 return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4701 return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
4702 secret, state->secretLimit + XXH_STRIPE_LEN);
4703}
4704
4705/* 128-bit utility functions */
4706
4707#include <string.h> /* memcmp, memcpy */
4708
4709/* return : 1 if equal, 0 if different */
4710XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
4711{
4712 /* note : XXH128_hash_t is compact, it has no padding byte */
4713 return !(memcmp(&h1, &h2, sizeof(h1)));
4714}
4715
4716/* This prototype is compatible with stdlib's qsort().
4717 * return : >0 if *h128_1 > *h128_2
4718 * <0 if *h128_1 < *h128_2
4719 * =0 if *h128_1 == *h128_2 */
4720XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
4721{
4722 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
4723 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
4724 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
4725    /* note : expects that, in most cases, hash values are different */
4726 if (hcmp) return hcmp;
4727 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
4728}
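
/*
 * Illustrative use with stdlib's qsort() (a minimal sketch; NB_HASHES and
 * hashes[] are hypothetical):
 *
 *     #include <stdlib.h>   // qsort
 *
 *     XXH128_hash_t hashes[NB_HASHES];
 *     // ... fill hashes[] ...
 *     qsort(hashes, NB_HASHES, sizeof(hashes[0]), XXH128_cmp);
 *     // hashes[] is now sorted by (high64, low64), the order defined by XXH128_cmp.
 */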
4729
4730
4731/*====== Canonical representation ======*/
4732XXH_PUBLIC_API void
4733XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
4734{
4735 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
4736 if (XXH_CPU_LITTLE_ENDIAN) {
4737 hash.high64 = XXH_swap64(hash.high64);
4738 hash.low64 = XXH_swap64(hash.low64);
4739 }
4740 memcpy(dst, &hash.high64, sizeof(hash.high64));
4741 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
4742}
4743
4744XXH_PUBLIC_API XXH128_hash_t
4745XXH128_hashFromCanonical(const XXH128_canonical_t* src)
4746{
4747 XXH128_hash_t h;
4748 h.high64 = XXH_readBE64(src);
4749 h.low64 = XXH_readBE64(src->digest + 8);
4750 return h;
4751}
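
/*
 * Illustrative canonical round-trip (a minimal sketch; `h` stands for a
 * previously computed XXH128_hash_t):
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);
 *     // canon.digest[] now holds the hash in big-endian byte order, and can be
 *     // stored or exchanged as-is, independently of the host's endianness.
 *
 *     XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(back, h) returns 1 : the round-trip preserves the value.
 */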
4752
4753/* Pop our optimization override from above */
4754#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
4755 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
4756 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
4757# pragma GCC pop_options
4758#endif
4759
4760#endif /* XXH_NO_LONG_LONG */
4761
4762
4763#endif /* XXH_IMPLEMENTATION */
Willy Tarreaub5684e02015-04-27 11:59:40 +02004764
4765
4766#if defined (__cplusplus)
4767}
4768#endif