Blame - include/import/xxhash.h - haproxy

blob: 2d56d23c5d0beac4c1a4cab22da660674d0e0080 [file] [log] [blame]

Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1	/*
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	2	* xxHash - Extremely Fast Hash algorithm
				3	* Header File
				4	* Copyright (C) 2012-2020 Yann Collet
				5	*
				6	* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
				7	*
				8	* Redistribution and use in source and binary forms, with or without
				9	* modification, are permitted provided that the following conditions are
				10	* met:
				11	*
				12	* * Redistributions of source code must retain the above copyright
				13	* notice, this list of conditions and the following disclaimer.
				14	* * Redistributions in binary form must reproduce the above
				15	* copyright notice, this list of conditions and the following disclaimer
				16	* in the documentation and/or other materials provided with the
				17	* distribution.
				18	*
				19	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				20	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				21	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				22	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				23	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				24	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				25	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				26	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				27	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				28	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				29	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30	*
				31	* You can contact the author at:
				32	* - xxHash homepage: https://www.xxhash.com
				33	* - xxHash source repository: https://github.com/Cyan4973/xxHash
				34	*/
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	35
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	36	/* TODO: update */
				37	/* Notice extracted from xxHash homepage:
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	38
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	39	xxHash is an extremely fast hash algorithm, running at RAM speed limits.
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	40	It also successfully passes all tests from the SMHasher suite.
				41
				42	Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
				43
				44	Name Speed Q.Score Author
				45	xxHash 5.4 GB/s 10
				46	CrapWow 3.2 GB/s 2 Andrew
				47	MurmurHash 3a 2.7 GB/s 10 Austin Appleby
				48	SpookyHash 2.0 GB/s 10 Bob Jenkins
				49	SBox 1.4 GB/s 9 Bret Mulvey
				50	Lookup3 1.2 GB/s 9 Bob Jenkins
				51	SuperFastHash 1.2 GB/s 1 Paul Hsieh
				52	CityHash64 1.05 GB/s 10 Pike & Alakuijala
				53	FNV 0.55 GB/s 5 Fowler, Noll, Vo
				54	CRC32 0.43 GB/s 9
				55	MD5-32 0.33 GB/s 10 Ronald L. Rivest
				56	SHA1-32 0.28 GB/s 10
				57
				58	Q.Score is a measure of quality of the hash function.
				59	It depends on successfully passing SMHasher test set.
				60	10 is a perfect score.
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	61
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	62	Note: SMHasher's CRC32 implementation is not the fastest one.
				63	Other speed-oriented implementations can be faster,
				64	especially in combination with PCLMUL instruction:
				65	https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
				66
				67	A 64-bit version, named XXH64, is available since r35.
				68	It offers much better speed, but for 64-bit applications only.
				69	Name Speed on 64 bits Speed on 32 bits
				70	XXH64 13.8 GB/s 1.9 GB/s
				71	XXH32 6.8 GB/s 6.0 GB/s
				72	*/
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	73
				74	#if defined (__cplusplus)
				75	extern "C" {
				76	#endif
				77
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	78	/* ****************************
				79	* INLINE mode
				80	******************************/
				81	/*!
				82	* XXH_INLINE_ALL (and XXH_PRIVATE_API)
				83	* Use these build macros to inline xxhash into the target unit.
				84	* Inlining improves performance on small inputs, especially when the length is
				85	* expressed as a compile-time constant:
				86	*
				87	* https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
				88	*
				89	* It also keeps xxHash symbols private to the unit, so they are not exported.
				90	*
				91	* Usage:
				92	* #define XXH_INLINE_ALL
				93	* #include "xxhash.h"
				94	*
				95	* Do not compile and link xxhash.o as a separate object, as it is not useful.
				96	*/
				97	#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
				98	&& !defined(XXH_INLINE_ALL_31684351384)
				99	/* this section should be traversed only once */
				100	# define XXH_INLINE_ALL_31684351384
				101	/* give access to the advanced API, required to compile implementations */
				102	# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
				103	# define XXH_STATIC_LINKING_ONLY
				104	/* make all functions private */
				105	# undef XXH_PUBLIC_API
				106	# if defined(__GNUC__)
				107	# define XXH_PUBLIC_API static __inline __attribute__((unused))
				108	# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
				109	# define XXH_PUBLIC_API static inline
				110	# elif defined(_MSC_VER)
				111	# define XXH_PUBLIC_API static __inline
				112	# else
				113	/* note: this version may generate warnings for unused static functions */
				114	# define XXH_PUBLIC_API static
				115	# endif
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	116
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	117	/*
				118	* This part deals with the special case where a unit wants to inline xxHash,
				119	* but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
				120	* as part of some previously included *.h header file.
				121	* Without further action, the new include would just be ignored,
				122	* and functions would effectively _not_ be inlined (silent failure).
				123	* The following macros solve this situation by prefixing all inlined names,
				124	* avoiding naming collision with previous inclusions.
				125	*/
				126	# ifdef XXH_NAMESPACE
				127	# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
				128	/*
				129	* Note: Alternative: #undef all symbols (it's a pretty large list).
				130	* Without #error: it compiles, but functions are actually not inlined.
				131	*/
				132	# endif
				133	# define XXH_NAMESPACE XXH_INLINE_
				134	/*
				135	* Some identifiers (enums, type names) are not symbols, but they must
				136	* still be renamed to avoid redeclaration.
				137	* Alternative solution: do not redeclare them.
				138	* However, this requires some #ifdefs, and is a more dispersed action.
				139	* Meanwhile, renaming can be achieved in a single block
				140	*/
				141	# define XXH_IPREF(Id) XXH_INLINE_ ## Id
				142	# define XXH_OK XXH_IPREF(XXH_OK)
				143	# define XXH_ERROR XXH_IPREF(XXH_ERROR)
				144	# define XXH_errorcode XXH_IPREF(XXH_errorcode)
				145	# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
				146	# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
				147	# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
				148	# define XXH32_state_s XXH_IPREF(XXH32_state_s)
				149	# define XXH32_state_t XXH_IPREF(XXH32_state_t)
				150	# define XXH64_state_s XXH_IPREF(XXH64_state_s)
				151	# define XXH64_state_t XXH_IPREF(XXH64_state_t)
				152	# define XXH3_state_s XXH_IPREF(XXH3_state_s)
				153	# define XXH3_state_t XXH_IPREF(XXH3_state_t)
				154	# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
				155	/* Ensure the header is parsed again, even if it was previously included */
				156	# undef XXHASH_H_5627135585666179
				157	# undef XXHASH_H_STATIC_13879238742
				158	#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	159
				160
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	161
				162	/* ****************************************************************
				163	* Stable API
				164	*****************************************************************/
				165	#ifndef XXHASH_H_5627135585666179
				166	#define XXHASH_H_5627135585666179 1
				167
				168	/* specific declaration modes for Windows */
				169	#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
				170	# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
				171	# ifdef XXH_EXPORT
				172	# define XXH_PUBLIC_API __declspec(dllexport)
				173	# elif XXH_IMPORT
				174	# define XXH_PUBLIC_API __declspec(dllimport)
				175	# endif
				176	# else
				177	# define XXH_PUBLIC_API /* do nothing */
				178	# endif
				179	#endif
				180
				181	/*!
				182	* XXH_NAMESPACE, aka Namespace Emulation:
				183	*
				184	* If you want to include _and expose_ xxHash functions from within your own
				185	* library, but also want to avoid symbol collisions with other libraries which
				186	* may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
				187	* any public symbol from xxhash library with the value of XXH_NAMESPACE
				188	* (therefore, avoid empty or numeric values).
				189	*
				190	* Note that no change is required within the calling program as long as it
				191	* includes `xxhash.h`: Regular symbol names will be automatically translated
				192	* by this header.
				193	*/
				194	#ifdef XXH_NAMESPACE
				195	# define XXH_CAT(A,B) A##B
				196	# define XXH_NAME2(A,B) XXH_CAT(A,B)
				197	# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
				198	/* XXH32 */
				199	# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
				200	# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
				201	# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
				202	# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
				203	# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
				204	# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
				205	# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
				206	# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
				207	# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
				208	/* XXH64 */
				209	# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
				210	# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
				211	# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
				212	# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
				213	# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
				214	# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
				215	# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
				216	# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
				217	# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
				218	/* XXH3_64bits */
				219	# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
				220	# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
				221	# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
				222	# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
				223	# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
				224	# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
				225	# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
				226	# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
				227	# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
				228	# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
				229	# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
				230	# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
				231	/* XXH3_128bits */
				232	# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
				233	# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
				234	# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
				235	# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
				236	# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
				237	# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
				238	# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
				239	# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
				240	# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
				241	# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
				242	# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
				243	# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
				244	# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
				245	#endif
				246
				247
				248	/* *************************************
				249	* Version
				250	***************************************/
				251	#define XXH_VERSION_MAJOR 0
				252	#define XXH_VERSION_MINOR 8
				253	#define XXH_VERSION_RELEASE 0
				254	#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
				255	XXH_PUBLIC_API unsigned XXH_versionNumber (void);
				256
				257
				258	/* ****************************
				259	* Definitions
				260	******************************/
				261	#include <stddef.h> /* size_t */
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	262	typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
				263
				264
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	265	/*-*********************************************************************
				266	* 32-bit hash
				267	************************************************************************/
				268	#if !defined (__VMS) \
				269	&& (defined (__cplusplus) \
				270	|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
				271	# include <stdint.h>
				272	typedef uint32_t XXH32_hash_t;
				273	#else
				274	# include <limits.h>
				275	# if UINT_MAX == 0xFFFFFFFFUL
				276	typedef unsigned int XXH32_hash_t;
				277	# else
				278	# if ULONG_MAX == 0xFFFFFFFFUL
				279	typedef unsigned long XXH32_hash_t;
				280	# else
				281	# error "unsupported platform: need a 32-bit type"
				282	# endif
				283	# endif
				284	#endif
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	285
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	286	/*!
				287	* XXH32():
				288	* Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
				289	* The memory between input & input+length must be valid (allocated and read-accessible).
				290	* "seed" can be used to alter the result predictably.
				291	* Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
				292	*
				293	* Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
				294	* and offers true 64/128 bit hash results. It provides a superior level of
				295	* dispersion, and greatly reduces the risks of collisions.
				296	*/
				297	XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	298
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	299	/***** Streaming *****/
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	300
				301	/*
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	302	* Streaming functions generate the xxHash value from an incremental input.
				303	* This method is slower than single-call functions, due to state management.
				304	* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
				305	*
				306	* An XXH state must first be allocated using `XXH*_createState()`.
				307	*
				308	* Start a new hash by initializing the state with a seed using `XXH*_reset()`.
				309	*
				310	* Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
				311	*
				312	* The function returns an error code, with 0 meaning OK, and any other value
				313	* meaning there is an error.
				314	*
				315	* Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
				316	* This function returns the nn-bits hash as an int or long long.
				317	*
				318	* It's still possible to continue inserting input into the hash state after a
				319	* digest, and generate new hash values later on by invoking `XXH*_digest()`.
				320	*
				321	* When done, release the state using `XXH*_freeState()`.
				322	*/
				323
				324	typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
				325	XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
				326	XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
				327	XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
				328
				329	XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
				330	XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
				331	XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
				332
				333	/***** Canonical representation *****/
				334
				335	/*
				336	* The default return values from XXH functions are unsigned 32 and 64 bit
				337	* integers.
				338	* This is the simplest and fastest format for further post-processing.
				339	*
				340	* However, this leaves open the question of what is the order on the byte level,
				341	* since little and big endian conventions will store the same number differently.
				342	*
				343	* The canonical representation settles this issue by mandating big-endian
				344	* convention, the same convention as human-readable numbers (large digits first).
				345	*
				346	* When writing hash values to storage, sending them over a network, or printing
				347	* them, it's highly recommended to use the canonical representation to ensure
				348	* portability across a wider range of systems, present and future.
				349	*
				350	* The following functions allow transformation of hash values to and from
				351	* canonical format.
				352	*/
				353
				354	typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
				355	XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
				356	XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
				357
				358
				359	#ifndef XXH_NO_LONG_LONG
				360	/*-*********************************************************************
				361	* 64-bit hash
				362	************************************************************************/
				363	#if !defined (__VMS) \
				364	&& (defined (__cplusplus) \
				365	|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
				366	# include <stdint.h>
				367	typedef uint64_t XXH64_hash_t;
				368	#else
				369	/* the following type must have a width of 64-bit */
				370	typedef unsigned long long XXH64_hash_t;
				371	#endif
				372
				373	/*!
				374	* XXH64():
				375	* Returns the 64-bit hash of sequence of length @length stored at memory
				376	* address @input.
				377	* @seed can be used to alter the result predictably.
				378	*
				379	* This function usually runs faster on 64-bit systems, but slower on 32-bit
				380	* systems (see benchmark).
				381	*
				382	* Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
				383	* and offers true 64/128 bit hash results. It provides a superior level of
				384	* dispersion, and greatly reduces the risks of collisions.
				385	*/
				386	XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
				387
				388	/***** Streaming *****/
				389	typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
				390	XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
				391	XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
				392	XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
				393
				394	XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
				395	XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
				396	XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
				397
				398	/***** Canonical representation *****/
				399	typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
				400	XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
				401	XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
				402
				403
				404	/*-*********************************************************************
				405	* XXH3 64-bit variant
				406	************************************************************************/
				407
				408	/* ************************************************************************
				409	* XXH3 is a new hash algorithm featuring:
				410	* - Improved speed for both small and large inputs
				411	* - True 64-bit and 128-bit outputs
				412	* - SIMD acceleration
				413	* - Improved 32-bit viability
				414	*
				415	* Speed analysis methodology is explained here:
				416	*
				417	* https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
				418	*
				419	* In general, expect XXH3 to run about ~2x faster on large inputs and >3x
				420	* faster on small ones compared to XXH64, though exact differences depend on
				421	* the platform.
				422	*
				423	* The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
				424	* on all platforms.
				425	*
				426	* It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
				427	*
				428	* Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
				429	* XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
				430	* explained in the implementation.
				431	*
				432	* Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
				433	* ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
				434	*
				435	* XXH3 offers 2 variants, _64bits and _128bits.
				436	* When only 64 bits are needed, prefer calling the _64bits variant, as it
				437	* reduces the amount of mixing, resulting in faster speed on small inputs.
				438	*
				439	* It's also generally simpler to manipulate a scalar return type than a struct.
				440	*
				441	* The 128-bit version adds additional strength, but it is slightly slower.
				442	*
				443	* The XXH3 algorithm is still in development.
				444	* The results it produces may still change in future versions.
				445	*
				446	* Results produced by v0.7.x are not comparable with results from v0.7.y.
				447	* However, the API is completely stable, and it can safely be used for
				448	* ephemeral data (local sessions).
				449	*
				450	* Avoid storing values in long-term storage until the algorithm is finalized.
				451	* XXH3's return values will be officially finalized upon reaching v0.8.0.
				452	*
				453	* After which, return values of XXH3 and XXH128 will no longer change in
				454	* future versions.
				455	*
				456	* The API supports one-shot hashing, streaming mode, and custom secrets.
				457	*/
				458
				459	/* XXH3_64bits():
				460	* default 64-bit variant, using default secret and default seed of 0.
				461	* It's the fastest variant. */
				462	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
				463
				464	/*
				465	* XXH3_64bits_withSeed():
				466	* This variant generates a custom secret on the fly
				467	* based on default secret altered using the `seed` value.
				468	* While this operation is decently fast, note that it's not completely free.
				469	* Note: seed==0 produces the same results as XXH3_64bits().
				470	*/
				471	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
				472
				473	/*
				474	* XXH3_64bits_withSecret():
				475	* It's possible to provide any blob of bytes as a "secret" to generate the hash.
				476	* This makes it more difficult for an external actor to prepare an intentional collision.
				477	* The main condition is that secretSize must be large enough (>= XXH3_SECRET_SIZE_MIN).
				478	* However, the quality of produced hash values depends on secret's entropy.
				479	* Technically, the secret must look like a bunch of random bytes.
				480	* Avoid "trivial" or structured data such as repeated sequences or a text document.
				481	* Whenever unsure about the "randomness" of the blob of bytes,
				482	* consider relabelling it as a "custom seed" instead,
				483	* and employ "XXH3_generateSecret()" (see below)
				484	* to generate a high entropy secret derived from the custom seed.
				485	*/
				486	#define XXH3_SECRET_SIZE_MIN 136
				487	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
				488
				489
				490	/***** Streaming *****/
				491	/*
				492	* Streaming requires state maintenance.
				493	* This operation costs memory and CPU.
				494	* As a consequence, streaming is slower than one-shot hashing.
				495	* For better performance, prefer one-shot functions whenever applicable.
				496	*/
				497	typedef struct XXH3_state_s XXH3_state_t;
				498	XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
				499	XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
				500	XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
				501
				502	/*
				503	* XXH3_64bits_reset():
				504	* Initialize with default parameters.
				505	* digest will be equivalent to `XXH3_64bits()`.
				506	*/
				507	XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
				508	/*
				509	* XXH3_64bits_reset_withSeed():
				510	* Generate a custom secret from `seed`, and store it into `statePtr`.
				511	* digest will be equivalent to `XXH3_64bits_withSeed()`.
				512	*/
				513	XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
				514	/*
				515	* XXH3_64bits_reset_withSecret():
				516	* `secret` is referenced, it _must outlive_ the hash streaming session.
				517	* Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
				518	* and the quality of produced hash values depends on secret's entropy
				519	* (secret's content should look like a bunch of random bytes).
				520	* When in doubt about the randomness of a candidate `secret`,
				521	* consider employing `XXH3_generateSecret()` instead (see below).
				522	*/
				523	XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
				524
				525	XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
				526	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
				527
				528	/* note : canonical representation of XXH3 is the same as XXH64
				529	* since they both produce XXH64_hash_t values */
				530
				531
				532	/*-*********************************************************************
				533	* XXH3 128-bit variant
				534	************************************************************************/
				535
				536	typedef struct {
				537	XXH64_hash_t low64;
				538	XXH64_hash_t high64;
				539	} XXH128_hash_t;
				540
				541	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
				542	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
				543	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
				544
				545	/***** Streaming *****/
				546	/*
				547	* Streaming requires state maintenance.
				548	* This operation costs memory and CPU.
				549	* As a consequence, streaming is slower than one-shot hashing.
				550	* For better performance, prefer one-shot functions whenever applicable.
				551	*
				552	* XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
				553	* Use already declared XXH3_createState() and XXH3_freeState().
				554	*
				555	* All reset and streaming functions have same meaning as their 64-bit counterpart.
				556	*/
				557
				558	XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
				559	XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
				560	XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
				561
				562	XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
				563	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
				564
				565	/* Following helper functions make it possible to compare XXH128_hash_t values.
				566	* Since XXH128_hash_t is a structure, this capability is not offered by the language.
				567	* Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
				568
				569	/*!
				570	* XXH128_isEqual():
				571	* Return: 1 if `h1` and `h2` are equal, 0 if they are not.
				572	*/
				573	XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
				574
				575	/*!
				576	* XXH128_cmp():
				577	*
				578	* This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
				579	*
				580	* return: >0 if h128_1 > h128_2
				581	* =0 if h128_1 == h128_2
				582	* <0 if h128_1 < h128_2
				583	*/
				584	XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
				585
				586
				587	/***** Canonical representation *****/
				588	typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
				589	XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
				590	XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
				591
				592
				593	#endif /* XXH_NO_LONG_LONG */
				594
				595	#endif /* XXHASH_H_5627135585666179 */
				596
				597
				598
				599	#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
				600	#define XXHASH_H_STATIC_13879238742
				601	/* ****************************************************************************
				602	* This section contains declarations which are not guaranteed to remain stable.
				603	* They may change in future versions, becoming incompatible with a different
				604	* version of the library.
				605	* These declarations should only be used with static linking.
				606	* Never use them in association with dynamic linking!
				607	***************************************************************************** */
				608
				609	/*
				610	* These definitions are only present to allow static allocation
				611	* of XXH states, on stack or in a struct, for example.
				612	* Never ever access their members directly.
				613	*/
				614
				615	struct XXH32_state_s {
				616	XXH32_hash_t total_len_32;
				617	XXH32_hash_t large_len;
				618	XXH32_hash_t v1;
				619	XXH32_hash_t v2;
				620	XXH32_hash_t v3;
				621	XXH32_hash_t v4;
				622	XXH32_hash_t mem32[4];
				623	XXH32_hash_t memsize;
				624	XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */
				625	}; /* typedef'd to XXH32_state_t */
				626
				627
				628	#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
				629
				630	struct XXH64_state_s {
				631	XXH64_hash_t total_len;
				632	XXH64_hash_t v1;
				633	XXH64_hash_t v2;
				634	XXH64_hash_t v3;
				635	XXH64_hash_t v4;
				636	XXH64_hash_t mem64[4];
				637	XXH32_hash_t memsize;
				638	XXH32_hash_t reserved32; /* required for padding anyway */
				639	XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */
				640	}; /* typedef'd to XXH64_state_t */
				641
				642	#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
				643	# include <stdalign.h>
				644	# define XXH_ALIGN(n) alignas(n)
				645	#elif defined(__GNUC__)
				646	# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
				647	#elif defined(_MSC_VER)
				648	# define XXH_ALIGN(n) __declspec(align(n))
				649	#else
				650	# define XXH_ALIGN(n) /* disabled */
				651	#endif
				652
				653	/* Old GCC versions only accept the attribute after the type in structures. */
				654	#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
				655	&& defined(__GNUC__)
				656	# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
				657	#else
				658	# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
				659	#endif
				660
				661	#define XXH3_INTERNALBUFFER_SIZE 256
				662	#define XXH3_SECRET_DEFAULT_SIZE 192
				663	struct XXH3_state_s {
				664	XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
				665	/* used to store a custom secret generated from a seed */
				666	XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
				667	XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
				668	XXH32_hash_t bufferedSize;
				669	XXH32_hash_t reserved32;
				670	size_t nbStripesSoFar;
				671	XXH64_hash_t totalLen;
				672	size_t nbStripesPerBlock;
				673	size_t secretLimit;
				674	XXH64_hash_t seed;
				675	XXH64_hash_t reserved64;
				676	const unsigned char* extSecret; /* reference to external secret;
				677	* if == NULL, use .customSecret instead */
				678	/* note: there may be some padding at the end due to alignment on 64 bytes */
				679	}; /* typedef'd to XXH3_state_t */
				680
				681	#undef XXH_ALIGN_MEMBER
				682
				683	/* When the XXH3_state_t structure is merely emplaced on stack,
				684	* it should be initialized with XXH3_INITSTATE() or a memset()
				685	* in case its first reset uses XXH3_NNbits_reset_withSeed().
				686	* This init can be omitted if the first reset uses default or _withSecret mode.
				687	* This operation isn't necessary when the state is created with XXH3_createState().
				688	* Note that this doesn't prepare the state for a streaming operation,
				689	* it's still necessary to use XXH3_NNbits_reset*() afterwards.
				690	*/
				691	#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
				692
				693
				694	/* === Experimental API === */
				695	/* Symbols defined below must be considered tied to a specific library version. */
				696
				697	/*
				698	* XXH3_generateSecret():
				699	*
				700	* Derive a high-entropy secret from any user-defined content, named customSeed.
				701	* The generated secret can be used in combination with `*_withSecret()` functions.
				702	* The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
				703	* as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
				704	*
				705	* The function accepts as input a custom seed of any length and any content,
				706	* and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
				707	* into an already allocated buffer secretBuffer.
				708	* The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
				709	*
				710	* The generated secret can then be used with any `*_withSecret()` variant.
				711	* Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
				712	* `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
				713	* are part of this list. They all accept a `secret` parameter
				714	* which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
				715	* _and_ feature very high entropy (consist of random-looking bytes).
				716	* These conditions can be a high bar to meet, so
				717	* this function can be used to generate a secret of proper quality.
				718	*
				719	* customSeed can be anything. It can have any size, even small ones,
				720	* and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
				721	* The resulting `secret` will nonetheless provide all expected qualities.
				722	*
				723	* Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
				724	* When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
				725	*/
				726	XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
				727
				728
				729	/* simple short-cut to pre-selected XXH3_128bits variant */
				730	XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
				731
				732
				733	#endif /* XXH_NO_LONG_LONG */
				734
				735
				736	#if defined(XXH_INLINE_ALL) \|\| defined(XXH_PRIVATE_API)
				737	# define XXH_IMPLEMENTATION
				738	#endif
				739
				740	#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
				741
				742
				743	/* ======================================================================== */
				744	/* ======================================================================== */
				745	/* ======================================================================== */
				746
				747
				748	/*-*********************************************************************
				749	* xxHash implementation
				750	*-*********************************************************************
				751	* xxHash's implementation used to be hosted inside xxhash.c.
				752	*
				753	* However, inlining requires implementation to be visible to the compiler,
				754	* hence be included alongside the header.
				755	* Previously, implementation was hosted inside xxhash.c,
				756	* which was then #included when inlining was activated.
				757	* This construction created issues with a few build and install systems,
				758	* as it required xxhash.c to be stored in /include directory.
				759	*
				760	* xxHash implementation is now directly integrated within xxhash.h.
				761	* As a consequence, xxhash.c is no longer needed in /include.
				762	*
				763	* xxhash.c is still available and is still useful.
				764	* In a "normal" setup, when xxhash is not inlined,
				765	* xxhash.h only exposes the prototypes and public symbols,
				766	* while xxhash.c can be built into an object file xxhash.o
				767	* which can then be linked into the final binary.
				768	************************************************************************/
				769
				770	#if ( defined(XXH_INLINE_ALL) \|\| defined(XXH_PRIVATE_API) \
				771	\|\| defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
				772	# define XXH_IMPLEM_13a8737387
				773
				774	/* *************************************
				775	* Tuning parameters
				776	***************************************/
				777	/*!
				778	* XXH_FORCE_MEMORY_ACCESS:
				779	* By default, access to unaligned memory is controlled by `memcpy()`, which is
				780	* safe and portable.
				781	*
				782	* Unfortunately, on some target/compiler combinations, the generated assembly
				783	* is sub-optimal.
				784	*
				785	* The switch below allows selection of a different access method
				786	* in the search for improved performance.
				787	* Method 0 (default):
				788	* Use `memcpy()`. Safe and portable. Default.
				789	* Method 1:
				790	* `__attribute__((packed))` statement. It depends on compiler extensions
				791	* and is therefore not portable.
				792	* This method is safe if your compiler supports it, and generally as
				793	* fast or faster than `memcpy`.
				794	* Method 2:
				795	* Direct access via cast. This method doesn't depend on the compiler but
				796	* violates the C standard.
				797	* It can generate buggy code on targets which do not support unaligned
				798	* memory accesses.
				799	* But in some circumstances, it's the only known way to get the most
				800	* performance (example: GCC + ARMv6)
				801	* Method 3:
				802	* Byteshift. This can generate the best code on old compilers which don't
				803	* inline small `memcpy()` calls, and it might also be faster on big-endian
				804	* systems which lack a native byteswap instruction.
				805	* See https://stackoverflow.com/a/32095106/646947 for details.
				806	* Prefer these methods in priority order (0 > 1 > 2 > 3)
				807	*/
				808	#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
				809	# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
				810	# define XXH_FORCE_MEMORY_ACCESS 2
				811	# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) \|\| \
				812	(defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
				813	# define XXH_FORCE_MEMORY_ACCESS 1
				814	# endif
				815	#endif
				816
				817	/*!
				818	* XXH_ACCEPT_NULL_INPUT_POINTER:
				819	* If the input pointer is NULL, xxHash's default behavior is to dereference it,
				820	* triggering a segfault.
				821	* When this macro is enabled, xxHash actively checks the input for a null pointer.
				822	* If it is, the result for null input pointers is the same as a zero-length input.
				823	*/
				824	#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
				825	# define XXH_ACCEPT_NULL_INPUT_POINTER 0
				826	#endif
				827
				828	/*!
				829	* XXH_FORCE_ALIGN_CHECK:
				830	* This is an important performance trick
				831	* for architectures without decent unaligned memory access performance.
				832	* It checks for input alignment, and when conditions are met,
				833	* uses a "fast path" employing direct 32-bit/64-bit read,
				834	* resulting in _dramatically faster_ read speed.
				835	*
				836	* The check costs one initial branch per hash, which is generally negligible, but not zero.
				837	* Moreover, it's not useful to generate binary for an additional code path
				838	* if memory access uses same instruction for both aligned and unaligned addresses.
				839	*
				840	* In these cases, the alignment check can be removed by setting this macro to 0.
				841	* Then the code will always use unaligned memory access.
				842	* Align check is automatically disabled on x86, x64 & arm64,
				843	* which are platforms known to offer good unaligned memory accesses performance.
				844	*
				845	* This option does not affect XXH3 (only XXH32 and XXH64).
				846	*/
				847	#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
				848	# if defined(__i386) \|\| defined(__x86_64__) \|\| defined(__aarch64__) \
				849	\|\| defined(_M_IX86) \|\| defined(_M_X64) \|\| defined(_M_ARM64) /* visual */
				850	# define XXH_FORCE_ALIGN_CHECK 0
				851	# else
				852	# define XXH_FORCE_ALIGN_CHECK 1
				853	# endif
				854	#endif
				855
				856	/*!
				857	* XXH_NO_INLINE_HINTS:
				858	*
				859	* By default, xxHash tries to force the compiler to inline almost all internal
				860	* functions.
				861	*
				862	* This can usually improve performance due to reduced jumping and improved
				863	* constant folding, but significantly increases the size of the binary which
				864	* might not be favorable.
				865	*
				866	* Additionally, sometimes the forced inlining can be detrimental to performance,
				867	* depending on the architecture.
				868	*
				869	* XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
				870	* compiler full control on whether to inline or not.
				871	*
				872	* When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
				873	* -fno-inline with GCC or Clang, this will automatically be defined.
				874	*/
				875	#ifndef XXH_NO_INLINE_HINTS
				876	# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
				877	\|\| defined(__NO_INLINE__) /* -O0, -fno-inline */
				878	# define XXH_NO_INLINE_HINTS 1
				879	# else
				880	# define XXH_NO_INLINE_HINTS 0
				881	# endif
				882	#endif
				883
				884	/*!
				885	* XXH_REROLL:
				886	* Whether to reroll XXH32_finalize, and XXH64_finalize,
				887	* instead of using an unrolled jump table/if statement loop.
				888	*
				889	* This is automatically defined on -Os/-Oz on GCC and Clang.
				890	*/
				891	#ifndef XXH_REROLL
				892	# if defined(__OPTIMIZE_SIZE__)
				893	# define XXH_REROLL 1
				894	# else
				895	# define XXH_REROLL 0
				896	# endif
				897	#endif
				898
				899
				900	/* *************************************
				901	* Includes & Memory related functions
				902	***************************************/
				903	/*!
				904	* Modify the local functions below should you wish to use
				905	* different memory routines for malloc() and free()
				906	*/
				907	#include <stdlib.h>
				908
				909	static void* XXH_malloc(size_t s) { return malloc(s); }
				910	static void XXH_free(void* p) { free(p); }
				911
				912	/! and for memcpy() /
				913	#include <string.h>
				914	static void* XXH_memcpy(void* dest, const void* src, size_t size)
				915	{
				916	return memcpy(dest,src,size);
				917	}
				918
				919	#include <limits.h> /* ULLONG_MAX */
				920
				921
				922	/* *************************************
				923	* Compiler Specific Options
				924	***************************************/
				925	#ifdef _MSC_VER /* Visual Studio warning fix */
				926	# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
				927	#endif
				928
				929	#if XXH_NO_INLINE_HINTS /* disable inlining hints */
				930	# if defined(__GNUC__)
				931	# define XXH_FORCE_INLINE static __attribute__((unused))
				932	# else
				933	# define XXH_FORCE_INLINE static
				934	# endif
				935	# define XXH_NO_INLINE static
				936	/* enable inlining hints */
				937	#elif defined(_MSC_VER) /* Visual Studio */
				938	# define XXH_FORCE_INLINE static __forceinline
				939	# define XXH_NO_INLINE static __declspec(noinline)
				940	#elif defined(__GNUC__)
				941	# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
				942	# define XXH_NO_INLINE static __attribute__((noinline))
				943	#elif defined (__cplusplus) \
				944	\|\| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
				945	# define XXH_FORCE_INLINE static inline
				946	# define XXH_NO_INLINE static
				947	#else
				948	# define XXH_FORCE_INLINE static
				949	# define XXH_NO_INLINE static
				950	#endif
				951
				952
				953
				954	/* *************************************
				955	* Debug
				956	***************************************/
				957	/*
				958	* XXH_DEBUGLEVEL is expected to be defined externally, typically via the
				959	* compiler's command line options. The value must be a number.
				960	*/
				961	#ifndef XXH_DEBUGLEVEL
				962	# ifdef DEBUGLEVEL /* backwards compat */
				963	# define XXH_DEBUGLEVEL DEBUGLEVEL
				964	# else
				965	# define XXH_DEBUGLEVEL 0
				966	# endif
				967	#endif
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	968
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	969	#if (XXH_DEBUGLEVEL>=1)
				970	# include <assert.h> /* note: can still be disabled with NDEBUG */
				971	# define XXH_ASSERT(c) assert(c)
				972	#else
				973	# define XXH_ASSERT(c) ((void)0)
				974	#endif
				975
				976	/* note: use after variable declarations */
				977	#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
				978
				979
				980	/* *************************************
				981	* Basic Types
				982	***************************************/
				983	#if !defined (__VMS) \
				984	&& (defined (__cplusplus) \
				985	\|\| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
				986	# include <stdint.h>
				987	typedef uint8_t xxh_u8;
				988	#else
				989	typedef unsigned char xxh_u8;
				990	#endif
				991	typedef XXH32_hash_t xxh_u32;
				992
				993	#ifdef XXH_OLD_NAMES
				994	# define BYTE xxh_u8
				995	# define U8 xxh_u8
				996	# define U32 xxh_u32
				997	#endif
				998
				999	/* * Memory access * */
				1000
				1001	#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
				1002	/*
				1003	* Manual byteshift. Best for old compilers which don't inline memcpy.
				1004	* We actually directly use XXH_readLE32 and XXH_readBE32.
				1005	*/
				1006	#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
				1007
				1008	/*
				1009	* Force direct memory access. Only works on CPU which support unaligned memory
				1010	* access in hardware.
				1011	*/
				1012	static xxh_u32 XXH_read32(const void* memPtr) { return (const xxh_u32) memPtr; }
				1013
				1014	#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
				1015
				1016	/*
				1017	* __pack instructions are safer but compiler specific, hence potentially
				1018	* problematic for some compilers.
				1019	*
				1020	* Currently only defined for GCC and ICC.
				1021	*/
				1022	#ifdef XXH_OLD_NAMES
				1023	typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
				1024	#endif
				1025	static xxh_u32 XXH_read32(const void* ptr)
				1026	{
				1027	typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
				1028	return ((const xxh_unalign*)ptr)->u32;
				1029	}
				1030
				1031	#else
				1032
				1033	/*
				1034	* Portable and safe solution. Generally efficient.
				1035	* see: https://stackoverflow.com/a/32095106/646947
				1036	*/
				1037	static xxh_u32 XXH_read32(const void* memPtr)
				1038	{
				1039	xxh_u32 val;
				1040	memcpy(&val, memPtr, sizeof(val));
				1041	return val;
				1042	}
				1043
				1044	#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
				1045
				1046
				1047	/* * Endianess * */
				1048	typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
				1049
				1050	/*!
				1051	* XXH_CPU_LITTLE_ENDIAN:
				1052	* Defined to 1 if the target is little endian, or 0 if it is big endian.
				1053	* It can be defined externally, for example on the compiler command line.
				1054	*
				1055	* If it is not defined, a runtime check (which is usually constant folded)
				1056	* is used instead.
				1057	*/
				1058	#ifndef XXH_CPU_LITTLE_ENDIAN
				1059	/*
				1060	* Try to detect endianness automatically, to avoid the nonstandard behavior
				1061	* in `XXH_isLittleEndian()`
				1062	*/
				1063	# if defined(_WIN32) /* Windows is always little endian */ \
				1064	\|\| defined(__LITTLE_ENDIAN__) \
				1065	\|\| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
				1066	# define XXH_CPU_LITTLE_ENDIAN 1
				1067	# elif defined(__BIG_ENDIAN__) \
				1068	\|\| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
				1069	# define XXH_CPU_LITTLE_ENDIAN 0
				1070	# else
				1071	/*
				1072	* runtime test, presumed to simplify to a constant by compiler
				1073	*/
				1074	static int XXH_isLittleEndian(void)
				1075	{
				1076	/*
				1077	* Portable and well-defined behavior.
				1078	* Don't use static: it is detrimental to performance.
				1079	*/
				1080	const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
				1081	return one.c[0];
				1082	}
				1083	# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
				1084	# endif
				1085	#endif
				1086
				1087
				1088
				1089
				1090	/* ****************************************
				1091	* Compiler-specific Functions and Macros
				1092	******************************************/
				1093	#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
				1094
				1095	#ifdef __has_builtin
				1096	# define XXH_HAS_BUILTIN(x) __has_builtin(x)
				1097	#else
				1098	# define XXH_HAS_BUILTIN(x) 0
				1099	#endif
				1100
				1101	#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
				1102	&& XXH_HAS_BUILTIN(__builtin_rotateleft64)
				1103	# define XXH_rotl32 __builtin_rotateleft32
				1104	# define XXH_rotl64 __builtin_rotateleft64
				1105	/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
				1106	#elif defined(_MSC_VER)
				1107	# define XXH_rotl32(x,r) _rotl(x,r)
				1108	# define XXH_rotl64(x,r) _rotl64(x,r)
				1109	#else
				1110	# define XXH_rotl32(x,r) (((x) << (r)) \| ((x) >> (32 - (r))))
				1111	# define XXH_rotl64(x,r) (((x) << (r)) \| ((x) >> (64 - (r))))
				1112	#endif
				1113
				1114	#if defined(_MSC_VER) /* Visual Studio */
				1115	# define XXH_swap32 _byteswap_ulong
				1116	#elif XXH_GCC_VERSION >= 403
				1117	# define XXH_swap32 __builtin_bswap32
				1118	#else
				1119	static xxh_u32 XXH_swap32 (xxh_u32 x)
				1120	{
				1121	return ((x << 24) & 0xff000000 ) \|
				1122	((x << 8) & 0x00ff0000 ) \|
				1123	((x >> 8) & 0x0000ff00 ) \|
				1124	((x >> 24) & 0x000000ff );
				1125	}
				1126	#endif
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1127
				1128
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1129	/* ***************************
				1130	* Memory reads
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1131	*****************************/
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1132	typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1133
				1134	/*
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1135	* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
				1136	*
				1137	* This is ideal for older compilers which don't inline memcpy.
				1138	*/
				1139	#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1140
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1141	XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
				1142	{
				1143	const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
				1144	return bytePtr[0]
				1145	\| ((xxh_u32)bytePtr[1] << 8)
				1146	\| ((xxh_u32)bytePtr[2] << 16)
				1147	\| ((xxh_u32)bytePtr[3] << 24);
				1148	}
				1149
				1150	XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
				1151	{
				1152	const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
				1153	return bytePtr[3]
				1154	\| ((xxh_u32)bytePtr[2] << 8)
				1155	\| ((xxh_u32)bytePtr[1] << 16)
				1156	\| ((xxh_u32)bytePtr[0] << 24);
				1157	}
				1158
				1159	#else
				1160	XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
				1161	{
				1162	return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
				1163	}
				1164
				1165	static xxh_u32 XXH_readBE32(const void* ptr)
				1166	{
				1167	return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
				1168	}
				1169	#endif
				1170
				1171	XXH_FORCE_INLINE xxh_u32
				1172	XXH_readLE32_align(const void* ptr, XXH_alignment align)
				1173	{
				1174	if (align==XXH_unaligned) {
				1175	return XXH_readLE32(ptr);
				1176	} else {
				1177	return XXH_CPU_LITTLE_ENDIAN ? (const xxh_u32)ptr : XXH_swap32((const xxh_u32)ptr);
				1178	}
				1179	}
				1180
				1181
				1182	/* *************************************
				1183	* Misc
				1184	***************************************/
				1185	XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
				1186
				1187
				1188	/* *******************************************************************
				1189	* 32-bit hash functions
				1190	*********************************************************************/
				1191	static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
				1192	static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
				1193	static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
				1194	static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
				1195	static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */
				1196
				1197	#ifdef XXH_OLD_NAMES
				1198	# define PRIME32_1 XXH_PRIME32_1
				1199	# define PRIME32_2 XXH_PRIME32_2
				1200	# define PRIME32_3 XXH_PRIME32_3
				1201	# define PRIME32_4 XXH_PRIME32_4
				1202	# define PRIME32_5 XXH_PRIME32_5
				1203	#endif
				1204
				1205	static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
				1206	{
				1207	acc += input * XXH_PRIME32_2;
				1208	acc = XXH_rotl32(acc, 13);
				1209	acc *= XXH_PRIME32_1;
				1210	#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
				1211	/*
				1212	* UGLY HACK:
				1213	* This inline assembly hack forces acc into a normal register. This is the
				1214	* only thing that prevents GCC and Clang from autovectorizing the XXH32
				1215	* loop (pragmas and attributes don't work for some resason) without globally
				1216	* disabling SSE4.1.
				1217	*
				1218	* The reason we want to avoid vectorization is because despite working on
				1219	* 4 integers at a time, there are multiple factors slowing XXH32 down on
				1220	* SSE4:
				1221	* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
				1222	* newer chips!) making it slightly slower to multiply four integers at
				1223	* once compared to four integers independently. Even when pmulld was
				1224	* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
				1225	* just to multiply unless doing a long operation.
				1226	*
				1227	* - Four instructions are required to rotate,
				1228	* movqda tmp, v // not required with VEX encoding
				1229	* pslld tmp, 13 // tmp <<= 13
				1230	* psrld v, 19 // x >>= 19
				1231	* por v, tmp // x \|= tmp
				1232	* compared to one for scalar:
				1233	* roll v, 13 // reliably fast across the board
				1234	* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
				1235	*
				1236	* - Instruction level parallelism is actually more beneficial here because
				1237	* the SIMD actually serializes this operation: While v1 is rotating, v2
				1238	* can load data, while v3 can multiply. SSE forces them to operate
				1239	* together.
				1240	*
				1241	* How this hack works:
				1242	* __asm__("" // Declare an assembly block but don't declare any instructions
				1243	* : // However, as an Input/Output Operand,
				1244	* "+r" // constrain a read/write operand (+) as a general purpose register (r).
				1245	* (acc) // and set acc as the operand
				1246	* );
				1247	*
				1248	* Because of the 'r', the compiler has promised that seed will be in a
				1249	* general purpose register and the '+' says that it will be 'read/write',
				1250	* so it has to assume it has changed. It is like volatile without all the
				1251	* loads and stores.
				1252	*
				1253	* Since the argument has to be in a normal register (not an SSE register),
				1254	* each time XXH32_round is called, it is impossible to vectorize.
				1255	*/
				1256	__asm__("" : "+r" (acc));
				1257	#endif
				1258	return acc;
				1259	}
				1260
				1261	/* mix all bits */
				1262	static xxh_u32 XXH32_avalanche(xxh_u32 h32)
				1263	{
				1264	h32 ^= h32 >> 15;
				1265	h32 *= XXH_PRIME32_2;
				1266	h32 ^= h32 >> 13;
				1267	h32 *= XXH_PRIME32_3;
				1268	h32 ^= h32 >> 16;
				1269	return(h32);
				1270	}
				1271
				1272	#define XXH_get32bits(p) XXH_readLE32_align(p, align)
				1273
				1274	static xxh_u32
				1275	XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
				1276	{
				1277	#define XXH_PROCESS1 do { \
				1278	h32 += (ptr++) XXH_PRIME32_5; \
				1279	h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
				1280	} while (0)
				1281
				1282	#define XXH_PROCESS4 do { \
				1283	h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
				1284	ptr += 4; \
				1285	h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
				1286	} while (0)
				1287
				1288	/* Compact rerolled version */
				1289	if (XXH_REROLL) {
				1290	len &= 15;
				1291	while (len >= 4) {
				1292	XXH_PROCESS4;
				1293	len -= 4;
				1294	}
				1295	while (len > 0) {
				1296	XXH_PROCESS1;
				1297	--len;
				1298	}
				1299	return XXH32_avalanche(h32);
				1300	} else {
				1301	switch(len&15) /* or switch(bEnd - p) */ {
				1302	case 12: XXH_PROCESS4;
				1303	/* fallthrough */
				1304	case 8: XXH_PROCESS4;
				1305	/* fallthrough */
				1306	case 4: XXH_PROCESS4;
				1307	return XXH32_avalanche(h32);
				1308
				1309	case 13: XXH_PROCESS4;
				1310	/* fallthrough */
				1311	case 9: XXH_PROCESS4;
				1312	/* fallthrough */
				1313	case 5: XXH_PROCESS4;
				1314	XXH_PROCESS1;
				1315	return XXH32_avalanche(h32);
				1316
				1317	case 14: XXH_PROCESS4;
				1318	/* fallthrough */
				1319	case 10: XXH_PROCESS4;
				1320	/* fallthrough */
				1321	case 6: XXH_PROCESS4;
				1322	XXH_PROCESS1;
				1323	XXH_PROCESS1;
				1324	return XXH32_avalanche(h32);
				1325
				1326	case 15: XXH_PROCESS4;
				1327	/* fallthrough */
				1328	case 11: XXH_PROCESS4;
				1329	/* fallthrough */
				1330	case 7: XXH_PROCESS4;
				1331	/* fallthrough */
				1332	case 3: XXH_PROCESS1;
				1333	/* fallthrough */
				1334	case 2: XXH_PROCESS1;
				1335	/* fallthrough */
				1336	case 1: XXH_PROCESS1;
				1337	/* fallthrough */
				1338	case 0: return XXH32_avalanche(h32);
				1339	}
				1340	XXH_ASSERT(0);
				1341	return h32; /* reaching this point is deemed impossible */
				1342	}
				1343	}
				1344
				1345	#ifdef XXH_OLD_NAMES
				1346	# define PROCESS1 XXH_PROCESS1
				1347	# define PROCESS4 XXH_PROCESS4
				1348	#else
				1349	# undef XXH_PROCESS1
				1350	# undef XXH_PROCESS4
				1351	#endif
				1352
				1353	XXH_FORCE_INLINE xxh_u32
				1354	XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
				1355	{
				1356	const xxh_u8* bEnd = input + len;
				1357	xxh_u32 h32;
				1358
				1359	#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
				1360	if (input==NULL) {
				1361	len=0;
				1362	bEnd=input=(const xxh_u8*)(size_t)16;
				1363	}
				1364	#endif
				1365
				1366	if (len>=16) {
				1367	const xxh_u8* const limit = bEnd - 15;
				1368	xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
				1369	xxh_u32 v2 = seed + XXH_PRIME32_2;
				1370	xxh_u32 v3 = seed + 0;
				1371	xxh_u32 v4 = seed - XXH_PRIME32_1;
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1372
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1373	do {
				1374	v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
				1375	v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
				1376	v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
				1377	v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
				1378	} while (input < limit);
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1379
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1380	h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)
				1381	+ XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
				1382	} else {
				1383	h32 = seed + XXH_PRIME32_5;
				1384	}
				1385
				1386	h32 += (xxh_u32)len;
				1387
				1388	return XXH32_finalize(h32, input, len&15, align);
				1389	}
				1390
				1391
				1392	XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
				1393	{
				1394	#if 0
				1395	/* Simple version, good for code maintenance, but unfortunately slow for small inputs */
				1396	XXH32_state_t state;
				1397	XXH32_reset(&state, seed);
				1398	XXH32_update(&state, (const xxh_u8*)input, len);
				1399	return XXH32_digest(&state);
				1400
				1401	#else
				1402
				1403	if (XXH_FORCE_ALIGN_CHECK) {
				1404	if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
				1405	return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
				1406	} }
				1407
				1408	return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
				1409	#endif
				1410	}
				1411
				1412
				1413
				1414	/***** Hash streaming *****/
				1415
				1416	XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
				1417	{
				1418	return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
				1419	}
				1420	XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
				1421	{
				1422	XXH_free(statePtr);
				1423	return XXH_OK;
				1424	}
				1425
				1426	XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
				1427	{
				1428	memcpy(dstState, srcState, sizeof(*dstState));
				1429	}
				1430
				1431	XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
				1432	{
				1433	XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
				1434	memset(&state, 0, sizeof(state));
				1435	state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
				1436	state.v2 = seed + XXH_PRIME32_2;
				1437	state.v3 = seed + 0;
				1438	state.v4 = seed - XXH_PRIME32_1;
				1439	/* do not write into reserved, planned to be removed in a future version */
				1440	memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
				1441	return XXH_OK;
				1442	}
				1443
				1444
				1445	XXH_PUBLIC_API XXH_errorcode
				1446	XXH32_update(XXH32_state_t* state, const void* input, size_t len)
				1447	{
				1448	if (input==NULL)
				1449	#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
				1450	return XXH_OK;
				1451	#else
				1452	return XXH_ERROR;
				1453	#endif
				1454
				1455	{ const xxh_u8* p = (const xxh_u8*)input;
				1456	const xxh_u8* const bEnd = p + len;
				1457
				1458	state->total_len_32 += (XXH32_hash_t)len;
				1459	state->large_len \|= (XXH32_hash_t)((len>=16) \| (state->total_len_32>=16));
				1460
				1461	if (state->memsize + len < 16) { /* fill in tmp buffer */
				1462	XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
				1463	state->memsize += (XXH32_hash_t)len;
				1464	return XXH_OK;
				1465	}
				1466
				1467	if (state->memsize) { /* some data left from previous update */
				1468	XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
				1469	{ const xxh_u32* p32 = state->mem32;
				1470	state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
				1471	state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
				1472	state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
				1473	state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
				1474	}
				1475	p += 16-state->memsize;
				1476	state->memsize = 0;
				1477	}
				1478
				1479	if (p <= bEnd-16) {
				1480	const xxh_u8* const limit = bEnd - 16;
				1481	xxh_u32 v1 = state->v1;
				1482	xxh_u32 v2 = state->v2;
				1483	xxh_u32 v3 = state->v3;
				1484	xxh_u32 v4 = state->v4;
				1485
				1486	do {
				1487	v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
				1488	v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
				1489	v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
				1490	v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
				1491	} while (p<=limit);
				1492
				1493	state->v1 = v1;
				1494	state->v2 = v2;
				1495	state->v3 = v3;
				1496	state->v4 = v4;
				1497	}
				1498
				1499	if (p < bEnd) {
				1500	XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
				1501	state->memsize = (unsigned)(bEnd-p);
				1502	}
				1503	}
				1504
				1505	return XXH_OK;
				1506	}
				1507
				1508
				1509	XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)
				1510	{
				1511	xxh_u32 h32;
				1512
				1513	if (state->large_len) {
				1514	h32 = XXH_rotl32(state->v1, 1)
				1515	+ XXH_rotl32(state->v2, 7)
				1516	+ XXH_rotl32(state->v3, 12)
				1517	+ XXH_rotl32(state->v4, 18);
				1518	} else {
				1519	h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
				1520	}
				1521
				1522	h32 += state->total_len_32;
				1523
				1524	return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
				1525	}
				1526
				1527
				1528	/***** Canonical representation *****/
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1529
				1530	/*
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1531	* The default return values from XXH functions are unsigned 32 and 64 bit
				1532	* integers.
				1533	*
				1534	* The canonical representation uses big endian convention, the same convention
				1535	* as human-readable numbers (large digits first).
				1536	*
				1537	* This way, hash values can be written into a file or buffer, remaining
				1538	* comparable across different systems.
				1539	*
				1540	* The following functions allow transformation of hash values to and from their
				1541	* canonical format.
				1542	*/
				1543	XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
				1544	{
				1545	XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
				1546	if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
				1547	memcpy(dst, &hash, sizeof(*dst));
				1548	}
				1549
				1550	XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
				1551	{
				1552	return XXH_readBE32(src);
				1553	}
				1554
				1555
				1556	#ifndef XXH_NO_LONG_LONG
				1557
				1558	/* *******************************************************************
				1559	* 64-bit hash functions
				1560	*********************************************************************/
				1561
				1562	/***** Memory access *****/
				1563
				1564	typedef XXH64_hash_t xxh_u64;
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1565
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1566	#ifdef XXH_OLD_NAMES
				1567	# define U64 xxh_u64
				1568	#endif
				1569
				1570	/*!
				1571	* XXH_REROLL_XXH64:
				1572	* Whether to reroll the XXH64_finalize() loop.
				1573	*
				1574	* Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
				1575	* performance gain on 64-bit hosts, as only one jump is required.
				1576	*
				1577	* However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
				1578	* registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
				1579	* to unroll. The code becomes ridiculously large (the largest function in the
				1580	* binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
				1581	* also slightly faster because it fits into cache better and is more likely
				1582	* to be inlined by the compiler.
				1583	*
				1584	* If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
				1585	*/
				1586	#ifndef XXH_REROLL_XXH64
				1587	# if (defined(__ILP32__) \|\| defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
				1588	\|\| !(defined(__x86_64__) \|\| defined(_M_X64) \|\| defined(_M_AMD64) /* x86-64 */ \
				1589	\|\| defined(_M_ARM64) \|\| defined(__aarch64__) \|\| defined(__arm64__) /* aarch64 */ \
				1590	\|\| defined(__PPC64__) \|\| defined(__PPC64LE__) \|\| defined(__ppc64__) \|\| defined(__powerpc64__) /* ppc64 */ \
				1591	\|\| defined(__mips64__) \|\| defined(__mips64)) /* mips64 */ \
				1592	\|\| (!defined(SIZE_MAX) \|\| SIZE_MAX < ULLONG_MAX) /* check limits */
				1593	# define XXH_REROLL_XXH64 1
				1594	# else
				1595	# define XXH_REROLL_XXH64 0
				1596	# endif
				1597	#endif /* !defined(XXH_REROLL_XXH64) */
				1598
				1599	#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
				1600	/*
				1601	* Manual byteshift. Best for old compilers which don't inline memcpy.
				1602	* We actually directly use XXH_readLE64 and XXH_readBE64.
				1603	*/
				1604	#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1605
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1606	/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
				1607	static xxh_u64 XXH_read64(const void* memPtr) { return (const xxh_u64) memPtr; }
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1608
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1609	#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1610
				1611	/*
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1612	* __pack instructions are safer, but compiler specific, hence potentially
				1613	* problematic for some compilers.
				1614	*
				1615	* Currently only defined for GCC and ICC.
				1616	*/
				1617	#ifdef XXH_OLD_NAMES
				1618	typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
				1619	#endif
				1620	static xxh_u64 XXH_read64(const void* ptr)
				1621	{
				1622	typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
				1623	return ((const xxh_unalign64*)ptr)->u64;
				1624	}
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1625
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1626	#else
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1627
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1628	/*
				1629	* Portable and safe solution. Generally efficient.
				1630	* see: https://stackoverflow.com/a/32095106/646947
				1631	*/
				1632	static xxh_u64 XXH_read64(const void* memPtr)
				1633	{
				1634	xxh_u64 val;
				1635	memcpy(&val, memPtr, sizeof(val));
				1636	return val;
				1637	}
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1638
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1639	#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1640
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1641	#if defined(_MSC_VER) /* Visual Studio */
				1642	# define XXH_swap64 _byteswap_uint64
				1643	#elif XXH_GCC_VERSION >= 403
				1644	# define XXH_swap64 __builtin_bswap64
				1645	#else
				1646	static xxh_u64 XXH_swap64 (xxh_u64 x)
				1647	{
				1648	return ((x << 56) & 0xff00000000000000ULL) \|
				1649	((x << 40) & 0x00ff000000000000ULL) \|
				1650	((x << 24) & 0x0000ff0000000000ULL) \|
				1651	((x << 8) & 0x000000ff00000000ULL) \|
				1652	((x >> 8) & 0x00000000ff000000ULL) \|
				1653	((x >> 24) & 0x0000000000ff0000ULL) \|
				1654	((x >> 40) & 0x000000000000ff00ULL) \|
				1655	((x >> 56) & 0x00000000000000ffULL);
				1656	}
				1657	#endif
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	1658
Dragan Dosen	de37443	2020-12-22 12:00:37 +0100	[diff] [blame]	1659
				1660	/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
				1661	#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
				1662
				1663	XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
				1664	{
				1665	const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
				1666	return bytePtr[0]
				1667	\| ((xxh_u64)bytePtr[1] << 8)
				1668	\| ((xxh_u64)bytePtr[2] << 16)
				1669	\| ((xxh_u64)bytePtr[3] << 24)
				1670	\| ((xxh_u64)bytePtr[4] << 32)
				1671	\| ((xxh_u64)bytePtr[5] << 40)
				1672	\| ((xxh_u64)bytePtr[6] << 48)
				1673	\| ((xxh_u64)bytePtr[7] << 56);
				1674	}
				1675
				1676	XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
				1677	{
				1678	const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
				1679	return bytePtr[7]
				1680	\| ((xxh_u64)bytePtr[6] << 8)
				1681	\| ((xxh_u64)bytePtr[5] << 16)
				1682	\| ((xxh_u64)bytePtr[4] << 24)
				1683	\| ((xxh_u64)bytePtr[3] << 32)
				1684	\| ((xxh_u64)bytePtr[2] << 40)
				1685	\| ((xxh_u64)bytePtr[1] << 48)
				1686	\| ((xxh_u64)bytePtr[0] << 56);
				1687	}
				1688
				1689	#else
				1690	XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
				1691	{
				1692	return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
				1693	}
				1694
				1695	static xxh_u64 XXH_readBE64(const void* ptr)
				1696	{
				1697	return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
				1698	}
				1699	#endif
				1700
				1701	XXH_FORCE_INLINE xxh_u64
				1702	XXH_readLE64_align(const void* ptr, XXH_alignment align)
				1703	{
				1704	if (align==XXH_unaligned)
				1705	return XXH_readLE64(ptr);
				1706	else
				1707	return XXH_CPU_LITTLE_ENDIAN ? (const xxh_u64)ptr : XXH_swap64((const xxh_u64)ptr);
				1708	}
				1709
				1710
				1711	/***** xxh64 *****/
				1712
				1713	static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
				1714	static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
				1715	static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
				1716	static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
				1717	static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
				1718
				1719	#ifdef XXH_OLD_NAMES
				1720	# define PRIME64_1 XXH_PRIME64_1
				1721	# define PRIME64_2 XXH_PRIME64_2
				1722	# define PRIME64_3 XXH_PRIME64_3
				1723	# define PRIME64_4 XXH_PRIME64_4
				1724	# define PRIME64_5 XXH_PRIME64_5
				1725	#endif
				1726
				1727	static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
				1728	{
				1729	acc += input * XXH_PRIME64_2;
				1730	acc = XXH_rotl64(acc, 31);
				1731	acc *= XXH_PRIME64_1;
				1732	return acc;
				1733	}
				1734
				1735	static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
				1736	{
				1737	val = XXH64_round(0, val);
				1738	acc ^= val;
				1739	acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
				1740	return acc;
				1741	}
				1742
				1743	static xxh_u64 XXH64_avalanche(xxh_u64 h64)
				1744	{
				1745	h64 ^= h64 >> 33;
				1746	h64 *= XXH_PRIME64_2;
				1747	h64 ^= h64 >> 29;
				1748	h64 *= XXH_PRIME64_3;
				1749	h64 ^= h64 >> 32;
				1750	return h64;
				1751	}
				1752
				1753
				1754	#define XXH_get64bits(p) XXH_readLE64_align(p, align)
				1755
				1756	static xxh_u64
				1757	XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
				1758	{
				1759	#define XXH_PROCESS1_64 do { \
				1760	h64 ^= (ptr++) XXH_PRIME64_5; \
				1761	h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
				1762	} while (0)
				1763
				1764	#define XXH_PROCESS4_64 do { \
				1765	h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \
				1766	ptr += 4; \
				1767	h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
				1768	} while (0)
				1769
				1770	#define XXH_PROCESS8_64 do { \
				1771	xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
				1772	ptr += 8; \
				1773	h64 ^= k1; \
				1774	h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
				1775	} while (0)
				1776
				1777	/* Rerolled version for 32-bit targets is faster and much smaller. */
				1778	if (XXH_REROLL \|\| XXH_REROLL_XXH64) {
				1779	len &= 31;
				1780	while (len >= 8) {
				1781	XXH_PROCESS8_64;
				1782	len -= 8;
				1783	}
				1784	if (len >= 4) {
				1785	XXH_PROCESS4_64;
				1786	len -= 4;
				1787	}
				1788	while (len > 0) {
				1789	XXH_PROCESS1_64;
				1790	--len;
				1791	}
				1792	return XXH64_avalanche(h64);
				1793	} else {
				1794	switch(len & 31) {
				1795	case 24: XXH_PROCESS8_64;
				1796	/* fallthrough */
				1797	case 16: XXH_PROCESS8_64;
				1798	/* fallthrough */
				1799	case 8: XXH_PROCESS8_64;
				1800	return XXH64_avalanche(h64);
				1801
				1802	case 28: XXH_PROCESS8_64;
				1803	/* fallthrough */
				1804	case 20: XXH_PROCESS8_64;
				1805	/* fallthrough */
				1806	case 12: XXH_PROCESS8_64;
				1807	/* fallthrough */
				1808	case 4: XXH_PROCESS4_64;
				1809	return XXH64_avalanche(h64);
				1810
				1811	case 25: XXH_PROCESS8_64;
				1812	/* fallthrough */
				1813	case 17: XXH_PROCESS8_64;
				1814	/* fallthrough */
				1815	case 9: XXH_PROCESS8_64;
				1816	XXH_PROCESS1_64;
				1817	return XXH64_avalanche(h64);
				1818
				1819	case 29: XXH_PROCESS8_64;
				1820	/* fallthrough */
				1821	case 21: XXH_PROCESS8_64;
				1822	/* fallthrough */
				1823	case 13: XXH_PROCESS8_64;
				1824	/* fallthrough */
				1825	case 5: XXH_PROCESS4_64;
				1826	XXH_PROCESS1_64;
				1827	return XXH64_avalanche(h64);
				1828
				1829	case 26: XXH_PROCESS8_64;
				1830	/* fallthrough */
				1831	case 18: XXH_PROCESS8_64;
				1832	/* fallthrough */
				1833	case 10: XXH_PROCESS8_64;
				1834	XXH_PROCESS1_64;
				1835	XXH_PROCESS1_64;
				1836	return XXH64_avalanche(h64);
				1837
				1838	case 30: XXH_PROCESS8_64;
				1839	/* fallthrough */
				1840	case 22: XXH_PROCESS8_64;
				1841	/* fallthrough */
				1842	case 14: XXH_PROCESS8_64;
				1843	/* fallthrough */
				1844	case 6: XXH_PROCESS4_64;
				1845	XXH_PROCESS1_64;
				1846	XXH_PROCESS1_64;
				1847	return XXH64_avalanche(h64);
				1848
				1849	case 27: XXH_PROCESS8_64;
				1850	/* fallthrough */
				1851	case 19: XXH_PROCESS8_64;
				1852	/* fallthrough */
				1853	case 11: XXH_PROCESS8_64;
				1854	XXH_PROCESS1_64;
				1855	XXH_PROCESS1_64;
				1856	XXH_PROCESS1_64;
				1857	return XXH64_avalanche(h64);
				1858
				1859	case 31: XXH_PROCESS8_64;
				1860	/* fallthrough */
				1861	case 23: XXH_PROCESS8_64;
				1862	/* fallthrough */
				1863	case 15: XXH_PROCESS8_64;
				1864	/* fallthrough */
				1865	case 7: XXH_PROCESS4_64;
				1866	/* fallthrough */
				1867	case 3: XXH_PROCESS1_64;
				1868	/* fallthrough */
				1869	case 2: XXH_PROCESS1_64;
				1870	/* fallthrough */
				1871	case 1: XXH_PROCESS1_64;
				1872	/* fallthrough */
				1873	case 0: return XXH64_avalanche(h64);
				1874	}
				1875	}
				1876	/* impossible to reach */
				1877	XXH_ASSERT(0);
				1878	return 0; /* unreachable, but some compilers complain without it */
				1879	}
				1880
				1881	#ifdef XXH_OLD_NAMES
				1882	# define PROCESS1_64 XXH_PROCESS1_64
				1883	# define PROCESS4_64 XXH_PROCESS4_64
				1884	# define PROCESS8_64 XXH_PROCESS8_64
				1885	#else
				1886	# undef XXH_PROCESS1_64
				1887	# undef XXH_PROCESS4_64
				1888	# undef XXH_PROCESS8_64
				1889	#endif
				1890
				1891	XXH_FORCE_INLINE xxh_u64
				1892	XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
				1893	{
				1894	const xxh_u8* bEnd = input + len;
				1895	xxh_u64 h64;
				1896
				1897	#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
				1898	if (input==NULL) {
				1899	len=0;
				1900	bEnd=input=(const xxh_u8*)(size_t)32;
				1901	}
				1902	#endif
				1903
				1904	if (len>=32) {
				1905	const xxh_u8* const limit = bEnd - 32;
				1906	xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
				1907	xxh_u64 v2 = seed + XXH_PRIME64_2;
				1908	xxh_u64 v3 = seed + 0;
				1909	xxh_u64 v4 = seed - XXH_PRIME64_1;
				1910
				1911	do {
				1912	v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
				1913	v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
				1914	v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
				1915	v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
				1916	} while (input<=limit);
				1917
				1918	h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
				1919	h64 = XXH64_mergeRound(h64, v1);
				1920	h64 = XXH64_mergeRound(h64, v2);
				1921	h64 = XXH64_mergeRound(h64, v3);
				1922	h64 = XXH64_mergeRound(h64, v4);
				1923
				1924	} else {
				1925	h64 = seed + XXH_PRIME64_5;
				1926	}
				1927
				1928	h64 += (xxh_u64) len;
				1929
				1930	return XXH64_finalize(h64, input, len, align);
				1931	}
				1932
				1933
				1934	XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
				1935	{
				1936	#if 0
				1937	/* Simple version, good for code maintenance, but unfortunately slow for small inputs */
				1938	XXH64_state_t state;
				1939	XXH64_reset(&state, seed);
				1940	XXH64_update(&state, (const xxh_u8*)input, len);
				1941	return XXH64_digest(&state);
				1942
				1943	#else
				1944
				1945	if (XXH_FORCE_ALIGN_CHECK) {
				1946	if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
				1947	return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
				1948	} }
				1949
				1950	return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
				1951
				1952	#endif
				1953	}
				1954
				1955	/***** Hash Streaming *****/
				1956
				1957	XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
				1958	{
				1959	return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
				1960	}
				1961	XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
				1962	{
				1963	XXH_free(statePtr);
				1964	return XXH_OK;
				1965	}
				1966
				1967	XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
				1968	{
				1969	memcpy(dstState, srcState, sizeof(*dstState));
				1970	}
				1971
				1972	XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
				1973	{
				1974	XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
				1975	memset(&state, 0, sizeof(state));
				1976	state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
				1977	state.v2 = seed + XXH_PRIME64_2;
				1978	state.v3 = seed + 0;
				1979	state.v4 = seed - XXH_PRIME64_1;
				1980	/* do not write into reserved64, might be removed in a future version */
				1981	memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
				1982	return XXH_OK;
				1983	}
				1984
				1985	XXH_PUBLIC_API XXH_errorcode
				1986	XXH64_update (XXH64_state_t* state, const void* input, size_t len)
				1987	{
				1988	if (input==NULL)
				1989	#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
				1990	return XXH_OK;
				1991	#else
				1992	return XXH_ERROR;
				1993	#endif
				1994
				1995	{ const xxh_u8* p = (const xxh_u8*)input;
				1996	const xxh_u8* const bEnd = p + len;
				1997
				1998	state->total_len += len;
				1999
				2000	if (state->memsize + len < 32) { /* fill in tmp buffer */
				2001	XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
				2002	state->memsize += (xxh_u32)len;
				2003	return XXH_OK;
				2004	}
				2005
				2006	if (state->memsize) { /* tmp buffer is full */
				2007	XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
				2008	state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
				2009	state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
				2010	state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
				2011	state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
				2012	p += 32-state->memsize;
				2013	state->memsize = 0;
				2014	}
				2015
				2016	if (p+32 <= bEnd) {
				2017	const xxh_u8* const limit = bEnd - 32;
				2018	xxh_u64 v1 = state->v1;
				2019	xxh_u64 v2 = state->v2;
				2020	xxh_u64 v3 = state->v3;
				2021	xxh_u64 v4 = state->v4;
				2022
				2023	do {
				2024	v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
				2025	v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
				2026	v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
				2027	v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
				2028	} while (p<=limit);
				2029
				2030	state->v1 = v1;
				2031	state->v2 = v2;
				2032	state->v3 = v3;
				2033	state->v4 = v4;
				2034	}
				2035
				2036	if (p < bEnd) {
				2037	XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
				2038	state->memsize = (unsigned)(bEnd-p);
				2039	}
				2040	}
				2041
				2042	return XXH_OK;
				2043	}
				2044
				2045
				2046	XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
				2047	{
				2048	xxh_u64 h64;
				2049
				2050	if (state->total_len >= 32) {
				2051	xxh_u64 const v1 = state->v1;
				2052	xxh_u64 const v2 = state->v2;
				2053	xxh_u64 const v3 = state->v3;
				2054	xxh_u64 const v4 = state->v4;
				2055
				2056	h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
				2057	h64 = XXH64_mergeRound(h64, v1);
				2058	h64 = XXH64_mergeRound(h64, v2);
				2059	h64 = XXH64_mergeRound(h64, v3);
				2060	h64 = XXH64_mergeRound(h64, v4);
				2061	} else {
				2062	h64 = state->v3 /seed/ + XXH_PRIME64_5;
				2063	}
				2064
				2065	h64 += (xxh_u64) state->total_len;
				2066
				2067	return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
				2068	}
				2069
				2070
				2071	/***** Canonical representation *****/
				2072
				2073	XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
				2074	{
				2075	XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
				2076	if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
				2077	memcpy(dst, &hash, sizeof(*dst));
				2078	}
				2079
				2080	XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
				2081	{
				2082	return XXH_readBE64(src);
				2083	}
				2084
				2085
				2086
				2087	/* *********************************************************************
				2088	* XXH3
				2089	* New generation hash designed for speed on small keys and vectorization
				2090	************************************************************************ */
				2091
				2092	/* === Compiler specifics === */
				2093
				2094	#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
				2095	# define XXH_RESTRICT restrict
				2096	#else
				2097	/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
				2098	# define XXH_RESTRICT /* disable */
				2099	#endif
				2100
				2101	#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
				2102	\|\| (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
				2103	\|\| defined(__clang__)
				2104	# define XXH_likely(x) __builtin_expect(x, 1)
				2105	# define XXH_unlikely(x) __builtin_expect(x, 0)
				2106	#else
				2107	# define XXH_likely(x) (x)
				2108	# define XXH_unlikely(x) (x)
				2109	#endif
				2110
				2111	#if defined(__GNUC__)
				2112	# if defined(__AVX2__)
				2113	# include <immintrin.h>
				2114	# elif defined(__SSE2__)
				2115	# include <emmintrin.h>
				2116	# elif defined(__ARM_NEON__) \|\| defined(__ARM_NEON)
				2117	# define inline __inline__ /* circumvent a clang bug */
				2118	# include <arm_neon.h>
				2119	# undef inline
				2120	# endif
				2121	#elif defined(_MSC_VER)
				2122	# include <intrin.h>
				2123	#endif
				2124
				2125	/*
				2126	* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
				2127	* remaining a true 64-bit/128-bit hash function.
				2128	*
				2129	* This is done by prioritizing a subset of 64-bit operations that can be
				2130	* emulated without too many steps on the average 32-bit machine.
				2131	*
				2132	* For example, these two lines seem similar, and run equally fast on 64-bit:
				2133	*
				2134	* xxh_u64 x;
				2135	* x ^= (x >> 47); // good
				2136	* x ^= (x >> 13); // bad
				2137	*
				2138	* However, to a 32-bit machine, there is a major difference.
				2139	*
				2140	* x ^= (x >> 47) looks like this:
				2141	*
				2142	* x.lo ^= (x.hi >> (47 - 32));
				2143	*
				2144	* while x ^= (x >> 13) looks like this:
				2145	*
				2146	* // note: funnel shifts are not usually cheap.
				2147	* x.lo ^= (x.lo >> 13) \| (x.hi << (32 - 13));
				2148	* x.hi ^= (x.hi >> 13);
				2149	*
				2150	* The first one is significantly faster than the second, simply because the
				2151	* shift is larger than 32. This means:
				2152	* - All the bits we need are in the upper 32 bits, so we can ignore the lower
				2153	* 32 bits in the shift.
				2154	* - The shift result will always fit in the lower 32 bits, and therefore,
				2155	* we can ignore the upper 32 bits in the xor.
				2156	*
				2157	* Thanks to this optimization, XXH3 only requires these features to be efficient:
				2158	*
				2159	* - Usable unaligned access
				2160	* - A 32-bit or 64-bit ALU
				2161	* - If 32-bit, a decent ADC instruction
				2162	* - A 32 or 64-bit multiply with a 64-bit result
				2163	* - For the 128-bit variant, a decent byteswap helps short inputs.
				2164	*
				2165	* The first two are already required by XXH32, and almost all 32-bit and 64-bit
				2166	* platforms which can run XXH32 can run XXH3 efficiently.
				2167	*
				2168	* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
				2169	* notable exception.
				2170	*
				2171	* First of all, Thumb-1 lacks support for the UMULL instruction which
				2172	* performs the important long multiply. This means numerous __aeabi_lmul
				2173	* calls.
				2174	*
				2175	* Second of all, the 8 functional registers are just not enough.
				2176	* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
				2177	* Lo registers, and this shuffling results in thousands more MOVs than A32.
				2178	*
				2179	* A32 and T32 don't have this limitation. They can access all 14 registers,
				2180	* do a 32->64 multiply with UMULL, and the flexible operand allowing free
				2181	* shifts is helpful, too.
				2182	*
				2183	* Therefore, we do a quick sanity check.
				2184	*
				2185	* If compiling Thumb-1 for a target which supports ARM instructions, we will
				2186	* emit a warning, as it is not a "sane" platform to compile for.
				2187	*
				2188	* Usually, if this happens, it is because of an accident and you probably need
				2189	* to specify -march, as you likely meant to compile for a newer architecture.
				2190	*
				2191	* Credit: large sections of the vectorial and asm source code paths
				2192	* have been contributed by @easyaspi314
				2193	*/
				2194	#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
				2195	# warning "XXH3 is highly inefficient without ARM or Thumb-2."
				2196	#endif
				2197
				2198	/* ==========================================
				2199	* Vectorization detection
				2200	* ========================================== */
				2201	#define XXH_SCALAR 0 /* Portable scalar version */
				2202	#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
				2203	#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
				2204	#define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */
				2205	#define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */
				2206	#define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */
				2207
				2208	#ifndef XXH_VECTOR /* can be defined on command line */
				2209	# if defined(__AVX512F__)
				2210	# define XXH_VECTOR XXH_AVX512
				2211	# elif defined(__AVX2__)
				2212	# define XXH_VECTOR XXH_AVX2
				2213	# elif defined(__SSE2__) \|\| defined(_M_AMD64) \|\| defined(_M_X64) \|\| (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
				2214	# define XXH_VECTOR XXH_SSE2
				2215	# elif defined(__GNUC__) /* msvc support maybe later */ \
				2216	&& (defined(__ARM_NEON__) \|\| defined(__ARM_NEON)) \
				2217	&& (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
				2218	\|\| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
				2219	# define XXH_VECTOR XXH_NEON
				2220	# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
				2221	\|\| (defined(__s390x__) && defined(__VEC__)) \
				2222	&& defined(__GNUC__) /* TODO: IBM XL */
				2223	# define XXH_VECTOR XXH_VSX
				2224	# else
				2225	# define XXH_VECTOR XXH_SCALAR
				2226	# endif
				2227	#endif
				2228
				2229	/*
				2230	* Controls the alignment of the accumulator,
				2231	* for compatibility with aligned vector loads, which are usually faster.
				2232	*/
				2233	#ifndef XXH_ACC_ALIGN
				2234	# if defined(XXH_X86DISPATCH)
				2235	# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
				2236	# elif XXH_VECTOR == XXH_SCALAR /* scalar */
				2237	# define XXH_ACC_ALIGN 8
				2238	# elif XXH_VECTOR == XXH_SSE2 /* sse2 */
				2239	# define XXH_ACC_ALIGN 16
				2240	# elif XXH_VECTOR == XXH_AVX2 /* avx2 */
				2241	# define XXH_ACC_ALIGN 32
				2242	# elif XXH_VECTOR == XXH_NEON /* neon */
				2243	# define XXH_ACC_ALIGN 16
				2244	# elif XXH_VECTOR == XXH_VSX /* vsx */
				2245	# define XXH_ACC_ALIGN 16
				2246	# elif XXH_VECTOR == XXH_AVX512 /* avx512 */
				2247	# define XXH_ACC_ALIGN 64
				2248	# endif
				2249	#endif
				2250
				2251	#if defined(XXH_X86DISPATCH) \|\| XXH_VECTOR == XXH_SSE2 \
				2252	\|\| XXH_VECTOR == XXH_AVX2 \|\| XXH_VECTOR == XXH_AVX512
				2253	# define XXH_SEC_ALIGN XXH_ACC_ALIGN
				2254	#else
				2255	# define XXH_SEC_ALIGN 8
				2256	#endif
				2257
				2258	/*
				2259	* UGLY HACK:
				2260	* GCC usually generates the best code with -O3 for xxHash.
				2261	*
				2262	* However, when targeting AVX2, it is overzealous in its unrolling resulting
				2263	* in code roughly 3/4 the speed of Clang.
				2264	*
				2265	* There are other issues, such as GCC splitting _mm256_loadu_si256 into
				2266	* _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
				2267	* only applies to Sandy and Ivy Bridge... which don't even support AVX2.
				2268	*
				2269	* That is why when compiling the AVX2 version, it is recommended to use either
				2270	* -O2 -mavx2 -march=haswell
				2271	* or
				2272	* -O2 -mavx2 -mno-avx256-split-unaligned-load
				2273	* for decent performance, or to use Clang instead.
				2274	*
				2275	* Fortunately, we can control the first one with a pragma that forces GCC into
				2276	* -O2, but the other one we can't control without "failed to inline always
				2277	* inline function due to target mismatch" warnings.
				2278	*/
				2279	#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
				2280	&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
				2281	&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
				2282	# pragma GCC push_options
				2283	# pragma GCC optimize("-O2")
				2284	#endif
				2285
				2286
				2287	#if XXH_VECTOR == XXH_NEON
				2288	/*
				2289	* NEON's setup for vmlal_u32 is a little more complicated than it is on
				2290	* SSE2, AVX2, and VSX.
				2291	*
				2292	* While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
				2293	*
				2294	* To do the same operation, the 128-bit 'Q' register needs to be split into
				2295	* two 64-bit 'D' registers, performing this operation::
				2296	*
				2297	*         [                a                 |                 b                ]
				2298	*            |              '---------. .--------'                |
				2299	*            |                         x                          |
				2300	*            |              .---------' '--------.                |
				2301	*         [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
				2302	*
				2303	* Due to significant changes in aarch64, the fastest method for aarch64 is
				2304	* completely different than the fastest method for ARMv7-A.
				2305	*
				2306	* ARMv7-A treats D registers as unions overlaying Q registers, so modifying
				2307	* D11 will modify the high half of Q5. This is similar to how modifying AH
				2308	* will only affect bits 8-15 of AX on x86.
				2309	*
				2310	* VZIP takes two registers, and puts even lanes in one register and odd lanes
				2311	* in the other.
				2312	*
				2313	* On ARMv7-A, this strangely modifies both parameters in place instead of
				2314	* taking the usual 3-operand form.
				2315	*
				2316	* Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
				2317	* lower and upper halves of the Q register to end up with the high and low
				2318	* halves where we want - all in one instruction.
				2319	*
				2320	* vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
				2321	*
				2322	* Unfortunately we need inline assembly for this: Instructions modifying two
				2323	* registers at once is not possible in GCC or Clang's IR, and they have to
				2324	* create a copy.
				2325	*
				2326	* aarch64 requires a different approach.
				2327	*
				2328	* In order to make it easier to write a decent compiler for aarch64, many
				2329	* quirks were removed, such as conditional execution.
				2330	*
				2331	* NEON was also affected by this.
				2332	*
				2333	* aarch64 cannot access the high bits of a Q-form register, and writes to a
				2334	* D-form register zero the high bits, similar to how writes to W-form scalar
				2335	* registers (or DWORD registers on x86_64) work.
				2336	*
				2337	* The formerly free vget_high intrinsics now require a vext (with a few
				2338	* exceptions)
				2339	*
				2340	* Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
				2341	* of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
				2342	* operand.
				2343	*
				2344	* The equivalent of the VZIP.32 on the lower and upper halves would be this
				2345	* mess:
				2346	*
				2347	* ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
				2348	* zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
				2349	* zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
				2350	*
				2351	* Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
				2352	*
				2353	* shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
				2354	* xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
				2355	*
				2356	* This is available on ARMv7-A, but is less efficient than a single VZIP.32.
				2357	*/
				2358
				2359	/*
				2360	* Function-like macro:
				2361	* void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
				2362	* {
				2363	* outLo = (uint32x2_t)(in & 0xFFFFFFFF);
				2364	* outHi = (uint32x2_t)(in >> 32);
				2365	* in = UNDEFINED;
				2366	* }
				2367	*/
				2368	# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
				2369	&& defined(__GNUC__) \
				2370	&& !defined(__aarch64__) && !defined(__arm64__)
				2371	# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
				2372	do { \
				2373	/* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
				2374	/* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
				2375	/* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
				2376	__asm__("vzip.32 %e0, %f0" : "+w" (in)); \
				2377	(outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
				2378	(outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
				2379	} while (0)
				2380	# else
				2381	# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
				2382	do { \
				2383	(outLo) = vmovn_u64 (in); \
				2384	(outHi) = vshrn_n_u64 ((in), 32); \
				2385	} while (0)
				2386	# endif
				2387	#endif /* XXH_VECTOR == XXH_NEON */
				2388
				2389	/*
				2390	* VSX and Z Vector helpers.
				2391	*
				2392	* This is very messy, and any pull requests to clean this up are welcome.
				2393	*
				2394	* There are a lot of problems with supporting VSX and s390x, due to
				2395	* inconsistent intrinsics, spotty coverage, and multiple endiannesses.
				2396	*/
				2397	#if XXH_VECTOR == XXH_VSX
				2398	# if defined(__s390x__)
				2399	# include <s390intrin.h>
				2400	# else
				2401	/* gcc's altivec.h can have the unwanted consequence to unconditionally
				2402	* #define bool, vector, and pixel keywords,
				2403	* with bad consequences for programs already using these keywords for other purposes.
				2404	* The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
				2405	* __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
				2406	* but it seems that, in some cases, it isn't.
				2407	* Force the build macro to be defined, so that keywords are not altered.
				2408	*/
				2409	# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
				2410	# define __APPLE_ALTIVEC__
				2411	# endif
				2412	# include <altivec.h>
				2413	# endif
				2414
				2415	typedef __vector unsigned long long xxh_u64x2;
				2416	typedef __vector unsigned char xxh_u8x16;
				2417	typedef __vector unsigned xxh_u32x4;
				2418
				2419	# ifndef XXH_VSX_BE
				2420	# if defined(__BIG_ENDIAN__) \
				2421	\|\| (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
				2422	# define XXH_VSX_BE 1
				2423	# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
				2424	# warning "-maltivec=be is not recommended. Please use native endianness."
				2425	# define XXH_VSX_BE 1
				2426	# else
				2427	# define XXH_VSX_BE 0
				2428	# endif
				2429	# endif /* !defined(XXH_VSX_BE) */
				2430
				2431	# if XXH_VSX_BE
				2432	/* A wrapper for POWER9's vec_revb. */
				2433	# if defined(__POWER9_VECTOR__) \|\| (defined(__clang__) && defined(__s390x__))
				2434	# define XXH_vec_revb vec_revb
				2435	# else
				2436	XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
				2437	{
				2438	xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
				2439	0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
				2440	return vec_perm(val, val, vByteSwap);
				2441	}
				2442	# endif
				2443	# endif /* XXH_VSX_BE */
				2444
				2445	/*
				2446	* Performs an unaligned load and byte swaps it on big endian.
				2447	*/
				2448	XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
				2449	{
				2450	xxh_u64x2 ret;
				2451	memcpy(&ret, ptr, sizeof(xxh_u64x2));
				2452	# if XXH_VSX_BE
				2453	ret = XXH_vec_revb(ret);
				2454	# endif
				2455	return ret;
				2456	}
				2457
				2458	/*
				2459	* vec_mulo and vec_mule are very problematic intrinsics on PowerPC
				2460	*
				2461	* These intrinsics weren't added until GCC 8, despite existing for a while,
				2462	* and they are endian dependent. Also, their meaning swap depending on version.
				2463	* */
				2464	# if defined(__s390x__)
				2465	/* s390x is always big endian, no issue on this platform */
				2466	# define XXH_vec_mulo vec_mulo
				2467	# define XXH_vec_mule vec_mule
				2468	# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
				2469	/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
				2470	# define XXH_vec_mulo __builtin_altivec_vmulouw
				2471	# define XXH_vec_mule __builtin_altivec_vmuleuw
				2472	# else
				2473	/* gcc needs inline assembly */
				2474	/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
				2475	XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
				2476	{
				2477	xxh_u64x2 result;
				2478	__asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
				2479	return result;
				2480	}
				2481	XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
				2482	{
				2483	xxh_u64x2 result;
				2484	__asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
				2485	return result;
				2486	}
				2487	# endif /* XXH_vec_mulo, XXH_vec_mule */
				2488	#endif /* XXH_VECTOR == XXH_VSX */
				2489
				2490
				2491	/* prefetch
				2492	* can be disabled, by declaring XXH_NO_PREFETCH build macro */
				2493	#if defined(XXH_NO_PREFETCH)
				2494	# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
				2495	#else
				2496	# if defined(_MSC_VER) && (defined(_M_X64) \|\| defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
				2497	# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
				2498	# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
				2499	# elif defined(__GNUC__) && ( (__GNUC__ >= 4) \|\| ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
				2500	# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read /, 3 / locality */)
				2501	# else
				2502	# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
				2503	# endif
				2504	#endif /* XXH_NO_PREFETCH */
				2505
				2506
				2507	/* ==========================================
				2508	* XXH3 default settings
				2509	* ========================================== */
				2510
				2511	#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
				2512
				2513	#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
				2514	# error "default keyset is not large enough"
				2515	#endif
				2516
				2517	/* Pseudorandom secret taken directly from FARSH */
				2518	XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
				2519	0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
				2520	0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
				2521	0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
				2522	0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
				2523	0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
				2524	0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
				2525	0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
				2526	0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
				2527	0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
				2528	0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
				2529	0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
				2530	0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
				2531	};
				2532
				2533
				2534	#ifdef XXH_OLD_NAMES
				2535	# define kSecret XXH3_kSecret
				2536	#endif
				2537
				2538	/*
				2539	* Calculates a 32-bit to 64-bit long multiply.
				2540	*
				2541	* Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
				2542	* need to (but it shouldn't need to anyways, it is about 7 instructions to do
				2543	* a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
				2544	* use that instead of the normal method.
				2545	*
				2546	* If you are compiling for platforms like Thumb-1 and don't have a better option,
				2547	* you may also want to write your own long multiply routine here.
				2548	*
				2549	* XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
				2550	* {
				2551	* return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
				2552	* }
				2553	*/
				2554	#if defined(_MSC_VER) && defined(_M_IX86)
				2555	# include <intrin.h>
				2556	# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
				2557	#else
				2558	/*
				2559	* Downcast + upcast is usually better than masking on older compilers like
				2560	* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
				2561	*
				2562	* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
				2563	* and perform a full 64x64 multiply -- entirely redundant on 32-bit.
				2564	*/
				2565	# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
				2566	#endif
				2567
				2568	/*
				2569	* Calculates a 64->128-bit long multiply.
				2570	*
				2571	* Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
				2572	*/
				2573	static XXH128_hash_t
				2574	XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
				2575	{
				2576	/*
				2577	* GCC/Clang __uint128_t method.
				2578	*
				2579	* On most 64-bit targets, GCC and Clang define a __uint128_t type.
				2580	* This is usually the best way as it usually uses a native long 64-bit
				2581	* multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
				2582	*
				2583	* Usually.
				2584	*
				2585	* Despite being a 32-bit platform, Clang (and emscripten) define this type
				2586	* despite not having the arithmetic for it. This results in a laggy
				2587	* compiler builtin call which calculates a full 128-bit multiply.
				2588	* In that case it is best to use the portable one.
				2589	* https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
				2590	*/
				2591	#if defined(__GNUC__) && !defined(__wasm__) \
				2592	&& defined(__SIZEOF_INT128__) \
				2593	\|\| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
				2594
				2595	__uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
				2596	XXH128_hash_t r128;
				2597	r128.low64 = (xxh_u64)(product);
				2598	r128.high64 = (xxh_u64)(product >> 64);
				2599	return r128;
				2600
				2601	/*
				2602	* MSVC for x64's _umul128 method.
				2603	*
				2604	* xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
				2605	*
				2606	* This compiles to single operand MUL on x64.
				2607	*/
				2608	#elif defined(_M_X64) \|\| defined(_M_IA64)
				2609
				2610	#ifndef _MSC_VER
				2611	# pragma intrinsic(_umul128)
				2612	#endif
				2613	xxh_u64 product_high;
				2614	xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
				2615	XXH128_hash_t r128;
				2616	r128.low64 = product_low;
				2617	r128.high64 = product_high;
				2618	return r128;
				2619
				2620	#else
				2621	/*
				2622	* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
				2623	*
				2624	* This is a fast and simple grade school multiply, which is shown below
				2625	* with base 10 arithmetic instead of base 0x100000000.
				2626	*
				2627	* 9 3 // D2 lhs = 93
				2628	* x 7 5 // D2 rhs = 75
				2629	* ----------
				2630	* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
				2631	* 4 5 \| // D2 hi_lo = (93 / 10) * (75 % 10) = 45
				2632	* 2 1 \| // D2 lo_hi = (93 % 10) * (75 / 10) = 21
				2633	* + 6 3 \| \| // D2 hi_hi = (93 / 10) * (75 / 10) = 63
				2634	* ---------
				2635	* 2 7 \| // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
				2636	* + 6 7 \| \| // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
				2637	* ---------
				2638	* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
				2639	*
				2640	* The reasons for adding the products like this are:
				2641	* 1. It avoids manual carry tracking. Just like how
				2642	* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
				2643	* This avoids a lot of complexity.
				2644	*
				2645	* 2. It hints for, and on Clang, compiles to, the powerful UMAAL
				2646	* instruction available in ARM's Digital Signal Processing extension
				2647	* in 32-bit ARMv6 and later, which is shown below:
				2648	*
				2649	* void UMAAL(xxh_u32 RdLo, xxh_u32 RdHi, xxh_u32 Rn, xxh_u32 Rm)
				2650	* {
				2651	* xxh_u64 product = (xxh_u64)RdLo (xxh_u64)*RdHi + Rn + Rm;
				2652	* *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
				2653	* *RdHi = (xxh_u32)(product >> 32);
				2654	* }
				2655	*
				2656	* This instruction was designed for efficient long multiplication, and
				2657	* allows this to be calculated in only 4 instructions at speeds
				2658	* comparable to some 64-bit ALUs.
				2659	*
				2660	* 3. It isn't terrible on other platforms. Usually this will be a couple
				2661	* of 32-bit ADD/ADCs.
				2662	*/
				2663
				2664	/* First calculate all of the cross products. */
				2665	xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
				2666	xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
				2667	xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
				2668	xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
				2669
				2670	/* Now add the products together. These will never overflow. */
				2671	xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
				2672	xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
				2673	xxh_u64 const lower = (cross << 32) \| (lo_lo & 0xFFFFFFFF);
				2674
				2675	XXH128_hash_t r128;
				2676	r128.low64 = lower;
				2677	r128.high64 = upper;
				2678	return r128;
				2679	#endif
				2680	}
				2681
				2682	/*
				2683	* Does a 64-bit to 128-bit multiply, then XOR folds it.
				2684	*
				2685	* The reason for the separate function is to prevent passing too many structs
				2686	* around by value. This will hopefully inline the multiply, but we don't force it.
				2687	*/
				2688	static xxh_u64
				2689	XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
				2690	{
				2691	XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
				2692	return product.low64 ^ product.high64;
				2693	}
				2694
				2695	/* Seems to produce slightly better code on GCC for some reason. */
				2696	XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
				2697	{
				2698	XXH_ASSERT(0 <= shift && shift < 64);
				2699	return v64 ^ (v64 >> shift);
				2700	}
				2701
				2702	/*
				2703	* This is a fast avalanche stage,
				2704	* suitable when input bits are already partially mixed
				2705	*/
				2706	static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
				2707	{
				2708	h64 = XXH_xorshift64(h64, 37);
				2709	h64 *= 0x165667919E3779F9ULL;
				2710	h64 = XXH_xorshift64(h64, 32);
				2711	return h64;
				2712	}
				2713
				2714	/*
				2715	* This is a stronger avalanche,
				2716	* inspired by Pelle Evensen's rrmxmx
				2717	* preferable when input has not been previously mixed
				2718	*/
				2719	static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
				2720	{
				2721	/* this mix is inspired by Pelle Evensen's rrmxmx */
				2722	h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
				2723	h64 *= 0x9FB21C651E98DF25ULL;
				2724	h64 ^= (h64 >> 35) + len ;
				2725	h64 *= 0x9FB21C651E98DF25ULL;
				2726	return XXH_xorshift64(h64, 28);
				2727	}
				2728
				2729
				2730	/* ==========================================
				2731	* Short keys
				2732	* ==========================================
				2733	* One of the shortcomings of XXH32 and XXH64 was that their performance was
				2734	* sub-optimal on short lengths. It used an iterative algorithm which strongly
				2735	* favored lengths that were a multiple of 4 or 8.
				2736	*
				2737	* Instead of iterating over individual inputs, we use a set of single shot
				2738	* functions which piece together a range of lengths and operate in constant time.
				2739	*
				2740	* Additionally, the number of multiplies has been significantly reduced. This
				2741	* reduces latency, especially when emulating 64-bit multiplies on 32-bit.
				2742	*
				2743	* Depending on the platform, this may or may not be faster than XXH32, but it
				2744	* is almost guaranteed to be faster than XXH64.
				2745	*/
				2746
				2747	/*
				2748	* At very short lengths, there isn't enough input to fully hide secrets, or use
				2749	* the entire secret.
				2750	*
				2751	* There is also only a limited amount of mixing we can do before significantly
				2752	* impacting performance.
				2753	*
				2754	* Therefore, we use different sections of the secret and always mix two secret
				2755	* samples with an XOR. This should have no effect on performance on the
				2756	* seedless or withSeed variants because everything _should_ be constant folded
				2757	* by modern compilers.
				2758	*
				2759	* The XOR mixing hides individual parts of the secret and increases entropy.
				2760	*
				2761	* This adds an extra layer of strength for custom secrets.
				2762	*/
				2763	XXH_FORCE_INLINE XXH64_hash_t
				2764	XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				2765	{
				2766	XXH_ASSERT(input != NULL);
				2767	XXH_ASSERT(1 <= len && len <= 3);
				2768	XXH_ASSERT(secret != NULL);
				2769	/*
				2770	* len = 1: combined = { input[0], 0x01, input[0], input[0] }
				2771	* len = 2: combined = { input[1], 0x02, input[0], input[1] }
				2772	* len = 3: combined = { input[2], 0x03, input[0], input[1] }
				2773	*/
				2774	{ xxh_u8 const c1 = input[0];
				2775	xxh_u8 const c2 = input[len >> 1];
				2776	xxh_u8 const c3 = input[len - 1];
				2777	xxh_u32 const combined = ((xxh_u32)c1 << 16) \| ((xxh_u32)c2 << 24)
				2778	\| ((xxh_u32)c3 << 0) \| ((xxh_u32)len << 8);
				2779	xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
				2780	xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
				2781	return XXH64_avalanche(keyed);
				2782	}
				2783	}
				2784
				2785	XXH_FORCE_INLINE XXH64_hash_t
				2786	XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				2787	{
				2788	XXH_ASSERT(input != NULL);
				2789	XXH_ASSERT(secret != NULL);
				2790	XXH_ASSERT(4 <= len && len < 8);
				2791	seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
				2792	{ xxh_u32 const input1 = XXH_readLE32(input);
				2793	xxh_u32 const input2 = XXH_readLE32(input + len - 4);
				2794	xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
				2795	xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
				2796	xxh_u64 const keyed = input64 ^ bitflip;
				2797	return XXH3_rrmxmx(keyed, len);
				2798	}
				2799	}
				2800
				2801	XXH_FORCE_INLINE XXH64_hash_t
				2802	XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				2803	{
				2804	XXH_ASSERT(input != NULL);
				2805	XXH_ASSERT(secret != NULL);
				2806	XXH_ASSERT(8 <= len && len <= 16);
				2807	{ xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
				2808	xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
				2809	xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
				2810	xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
				2811	xxh_u64 const acc = len
				2812	+ XXH_swap64(input_lo) + input_hi
				2813	+ XXH3_mul128_fold64(input_lo, input_hi);
				2814	return XXH3_avalanche(acc);
				2815	}
				2816	}
				2817
				2818	XXH_FORCE_INLINE XXH64_hash_t
				2819	XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				2820	{
				2821	XXH_ASSERT(len <= 16);
				2822	{ if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
				2823	if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
				2824	if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
				2825	return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
				2826	}
				2827	}
				2828
				2829	/*
				2830	* DISCLAIMER: There are known seed-dependent multicollisions here due to
				2831	* multiplication by zero, affecting hashes of lengths 17 to 240.
				2832	*
				2833	* However, they are very unlikely.
				2834	*
				2835	* Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
				2836	* unseeded non-cryptographic hashes, it does not attempt to defend itself
				2837	* against specially crafted inputs, only random inputs.
				2838	*
				2839	* Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
				2840	* cancelling out the secret is taken an arbitrary number of times (addressed
				2841	* in XXH3_accumulate_512), this collision is very unlikely with random inputs
				2842	* and/or proper seeding:
				2843	*
				2844	* This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
				2845	* function that is only called up to 16 times per hash with up to 240 bytes of
				2846	* input.
				2847	*
				2848	* This is not too bad for a non-cryptographic hash function, especially with
				2849	* only 64 bit outputs.
				2850	*
				2851	* The 128-bit variant (which trades some speed for strength) is NOT affected
				2852	* by this, although it is always a good idea to use a proper seed if you care
				2853	* about strength.
				2854	*/
				2855	XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
				2856	const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
				2857	{
				2858	#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
				2859	&& defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
				2860	&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
				2861	/*
				2862	* UGLY HACK:
				2863	* GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
				2864	* slower code.
				2865	*
				2866	* By forcing seed64 into a register, we disrupt the cost model and
				2867	* cause it to scalarize. See `XXH32_round()`
				2868	*
				2869	* FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
				2870	* XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
				2871	* GCC 9.2, despite both emitting scalar code.
				2872	*
				2873	* GCC generates much better scalar code than Clang for the rest of XXH3,
				2874	* which is why finding a more optimal codepath is an interest.
				2875	*/
				2876	__asm__ ("" : "+r" (seed64));
				2877	#endif
				2878	{ xxh_u64 const input_lo = XXH_readLE64(input);
				2879	xxh_u64 const input_hi = XXH_readLE64(input+8);
				2880	return XXH3_mul128_fold64(
				2881	input_lo ^ (XXH_readLE64(secret) + seed64),
				2882	input_hi ^ (XXH_readLE64(secret+8) - seed64)
				2883	);
				2884	}
				2885	}
				2886
				2887	/* For mid range keys, XXH3 uses a Mum-hash variant. */
				2888	XXH_FORCE_INLINE XXH64_hash_t
				2889	XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
				2890	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				2891	XXH64_hash_t seed)
				2892	{
				2893	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
				2894	XXH_ASSERT(16 < len && len <= 128);
				2895
				2896	{ xxh_u64 acc = len * XXH_PRIME64_1;
				2897	if (len > 32) {
				2898	if (len > 64) {
				2899	if (len > 96) {
				2900	acc += XXH3_mix16B(input+48, secret+96, seed);
				2901	acc += XXH3_mix16B(input+len-64, secret+112, seed);
				2902	}
				2903	acc += XXH3_mix16B(input+32, secret+64, seed);
				2904	acc += XXH3_mix16B(input+len-48, secret+80, seed);
				2905	}
				2906	acc += XXH3_mix16B(input+16, secret+32, seed);
				2907	acc += XXH3_mix16B(input+len-32, secret+48, seed);
				2908	}
				2909	acc += XXH3_mix16B(input+0, secret+0, seed);
				2910	acc += XXH3_mix16B(input+len-16, secret+16, seed);
				2911
				2912	return XXH3_avalanche(acc);
				2913	}
				2914	}
				2915
				2916	#define XXH3_MIDSIZE_MAX 240
				2917
				2918	XXH_NO_INLINE XXH64_hash_t
				2919	XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
				2920	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				2921	XXH64_hash_t seed)
				2922	{
				2923	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
				2924	XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
				2925
				2926	#define XXH3_MIDSIZE_STARTOFFSET 3
				2927	#define XXH3_MIDSIZE_LASTOFFSET 17
				2928
				2929	{ xxh_u64 acc = len * XXH_PRIME64_1;
				2930	int const nbRounds = (int)len / 16;
				2931	int i;
				2932	for (i=0; i<8; i++) {
				2933	acc += XXH3_mix16B(input+(16i), secret+(16i), seed);
				2934	}
				2935	acc = XXH3_avalanche(acc);
				2936	XXH_ASSERT(nbRounds >= 8);
				2937	#if defined(__clang__) /* Clang */ \
				2938	&& (defined(__ARM_NEON) \|\| defined(__ARM_NEON__)) /* NEON */ \
				2939	&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
				2940	/*
				2941	* UGLY HACK:
				2942	* Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
				2943	* In everywhere else, it uses scalar code.
				2944	*
				2945	* For 64->128-bit multiplies, even if the NEON was 100% optimal, it
				2946	* would still be slower than UMAAL (see XXH_mult64to128).
				2947	*
				2948	* Unfortunately, Clang doesn't handle the long multiplies properly and
				2949	* converts them to the nonexistent "vmulq_u64" intrinsic, which is then
				2950	* scalarized into an ugly mess of VMOV.32 instructions.
				2951	*
				2952	* This mess is difficult to avoid without turning autovectorization
				2953	* off completely, but they are usually relatively minor and/or not
				2954	* worth it to fix.
				2955	*
				2956	* This loop is the easiest to fix, as unlike XXH32, this pragma
				2957	* _actually works_ because it is a loop vectorization instead of an
				2958	* SLP vectorization.
				2959	*/
				2960	#pragma clang loop vectorize(disable)
				2961	#endif
				2962	for (i=8 ; i < nbRounds; i++) {
				2963	acc += XXH3_mix16B(input+(16i), secret+(16(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
				2964	}
				2965	/* last bytes */
				2966	acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
				2967	return XXH3_avalanche(acc);
				2968	}
				2969	}
				2970
				2971
				2972	/* ======= Long Keys ======= */
				2973
				/* Size in bytes of one input stripe fed to an accumulate_512 routine. */
				#define XXH_STRIPE_LEN 64
				#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
				/* Number of 64-bit accumulator lanes covering one stripe. */
				#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))

				/* Unprefixed aliases, only provided when XXH_OLD_NAMES is defined. */
				#ifdef XXH_OLD_NAMES
				# define STRIPE_LEN XXH_STRIPE_LEN
				# define ACC_NB XXH_ACC_NB
				#endif
				2982
				2983	XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
				2984	{
				2985	if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
				2986	memcpy(dst, &v64, sizeof(v64));
				2987	}
				2988
				/*
				 * Several intrinsic functions below are supposed to accept __int64 as argument,
				 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
				 * However, several environments do not define the __int64 type,
				 * requiring a workaround: xxh_i64 is the signed 64-bit type used instead.
				 */
				#if !defined (__VMS) \
				  && (defined (__cplusplus) \
				  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
				    typedef int64_t xxh_i64;
				#else
				    /* the following type must have a width of 64-bit */
				    typedef long long xxh_i64;
				#endif
				3002
				3003	/*
				3004	* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
				3005	*
				3006	* It is a hardened version of UMAC, based off of FARSH's implementation.
				3007	*
				3008	* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
				3009	* implementations, and it is ridiculously fast.
				3010	*
				3011	* We harden it by mixing the original input to the accumulators as well as the product.
				3012	*
				3013	* This means that in the (relatively likely) case of a multiply by zero, the
				3014	* original input is preserved.
				3015	*
				3016	* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
				3017	* cross-pollination, as otherwise the upper and lower halves would be
				3018	* essentially independent.
				3019	*
				3020	* This doesn't matter on 64-bit hashes since they all get merged together in
				3021	* the end, so we skip the extra step.
				3022	*
				3023	* Both XXH3_64bits and XXH3_128bits use this subroutine.
				3024	*/
				3025
				3026	#if (XXH_VECTOR == XXH_AVX512) \|\| defined(XXH_X86DISPATCH)
				3027
				3028	#ifndef XXH_TARGET_AVX512
				3029	# define XXH_TARGET_AVX512 /* disable attribute target */
				3030	#endif
				3031
				3032	XXH_FORCE_INLINE XXH_TARGET_AVX512 void
				3033	XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
				3034	const void* XXH_RESTRICT input,
				3035	const void* XXH_RESTRICT secret)
				3036	{
				3037	XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
				3038	XXH_ASSERT((((size_t)acc) & 63) == 0);
				3039	XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
				3040
				3041	{
				3042	/* data_vec = input[0]; */
				3043	__m512i const data_vec = _mm512_loadu_si512 (input);
				3044	/* key_vec = secret[0]; */
				3045	__m512i const key_vec = _mm512_loadu_si512 (secret);
				3046	/* data_key = data_vec ^ key_vec; */
				3047	__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
				3048	/* data_key_lo = data_key >> 32; */
				3049	__m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
				3050	/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
				3051	__m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
				3052	/* xacc[0] += swap(data_vec); */
				3053	__m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
				3054	__m512i const sum = _mm512_add_epi64(*xacc, data_swap);
				3055	/* xacc[0] += product; */
				3056	*xacc = _mm512_add_epi64(product, sum);
				3057	}
				3058	}
				3059
				3060	/*
				3061	* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
				3062	*
				3063	* Multiplication isn't perfect, as explained by Google in HighwayHash:
				3064	*
				3065	* // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
				3066	* // varying degrees. In descending order of goodness, bytes
				3067	* // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
				3068	* // As expected, the upper and lower bytes are much worse.
				3069	*
				3070	* Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
				3071	*
				3072	* Since our algorithm uses a pseudorandom secret to add some variance into the
				3073	* mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
				3074	*
				3075	* This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
				3076	* extraction.
				3077	*
				3078	* Both XXH3_64bits and XXH3_128bits use this subroutine.
				3079	*/
				3080
				3081	XXH_FORCE_INLINE XXH_TARGET_AVX512 void
				3082	XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3083	{
				3084	XXH_ASSERT((((size_t)acc) & 63) == 0);
				3085	XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
				3086	{ XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
				3087	const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
				3088
				3089	/* xacc[0] ^= (xacc[0] >> 47) */
				3090	__m512i const acc_vec = *xacc;
				3091	__m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
				3092	__m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
				3093	/* xacc[0] ^= secret; */
				3094	__m512i const key_vec = _mm512_loadu_si512 (secret);
				3095	__m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
				3096
				3097	/* xacc[0] = XXH_PRIME32_1; /
				3098	__m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
				3099	__m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
				3100	__m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
				3101	*xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
				3102	}
				3103	}
				3104
				3105	XXH_FORCE_INLINE XXH_TARGET_AVX512 void
				3106	XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
				3107	{
				3108	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
				3109	XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
				3110	XXH_ASSERT(((size_t)customSecret & 63) == 0);
				3111	(void)(&XXH_writeLE64);
				3112	{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
				3113	__m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
				3114
				3115	XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret;
				3116	XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret;
				3117	int i;
				3118	for (i=0; i < nbRounds; ++i) {
				3119	/* GCC has a bug, _mm512_stream_load_si512 accepts 'void', not 'void const',
				3120	* this will warn "discards ‘const’ qualifier". */
				3121	union {
				3122	XXH_ALIGN(64) const __m512i* cp;
				3123	XXH_ALIGN(64) void* p;
				3124	} remote_const_void;
				3125	remote_const_void.cp = src + i;
				3126	dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
				3127	} }
				3128	}
				3129
				3130	#endif
				3131
				3132	#if (XXH_VECTOR == XXH_AVX2) \|\| defined(XXH_X86DISPATCH)
				3133
				3134	#ifndef XXH_TARGET_AVX2
				3135	# define XXH_TARGET_AVX2 /* disable attribute target */
				3136	#endif
				3137
				3138	XXH_FORCE_INLINE XXH_TARGET_AVX2 void
				3139	XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
				3140	const void* XXH_RESTRICT input,
				3141	const void* XXH_RESTRICT secret)
				3142	{
				3143	XXH_ASSERT((((size_t)acc) & 31) == 0);
				3144	{ XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
				3145	/* Unaligned. This is mainly for pointer arithmetic, and because
				3146	* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
				3147	const __m256i* const xinput = (const __m256i *) input;
				3148	/* Unaligned. This is mainly for pointer arithmetic, and because
				3149	* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
				3150	const __m256i* const xsecret = (const __m256i *) secret;
				3151
				3152	size_t i;
				3153	for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
				3154	/* data_vec = xinput[i]; */
				3155	__m256i const data_vec = _mm256_loadu_si256 (xinput+i);
				3156	/* key_vec = xsecret[i]; */
				3157	__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
				3158	/* data_key = data_vec ^ key_vec; */
				3159	__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
				3160	/* data_key_lo = data_key >> 32; */
				3161	__m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
				3162	/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
				3163	__m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
				3164	/* xacc[i] += swap(data_vec); */
				3165	__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
				3166	__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
				3167	/* xacc[i] += product; */
				3168	xacc[i] = _mm256_add_epi64(product, sum);
				3169	} }
				3170	}
				3171
				3172	XXH_FORCE_INLINE XXH_TARGET_AVX2 void
				3173	XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3174	{
				3175	XXH_ASSERT((((size_t)acc) & 31) == 0);
				3176	{ XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
				3177	/* Unaligned. This is mainly for pointer arithmetic, and because
				3178	* _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
				3179	const __m256i* const xsecret = (const __m256i *) secret;
				3180	const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
				3181
				3182	size_t i;
				3183	for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
				3184	/* xacc[i] ^= (xacc[i] >> 47) */
				3185	__m256i const acc_vec = xacc[i];
				3186	__m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
				3187	__m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
				3188	/* xacc[i] ^= xsecret; */
				3189	__m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
				3190	__m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
				3191
				3192	/* xacc[i] = XXH_PRIME32_1; /
				3193	__m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
				3194	__m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
				3195	__m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
				3196	xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
				3197	}
				3198	}
				3199	}
				3200
				3201	XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
				3202	{
				3203	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
				3204	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
				3205	XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
				3206	(void)(&XXH_writeLE64);
				3207	XXH_PREFETCH(customSecret);
				3208	{ __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);
				3209
				3210	XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret;
				3211	XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret;
				3212
				3213	# if defined(__GNUC__) \|\| defined(__clang__)
				3214	/*
				3215	* On GCC & Clang, marking 'dest' as modified will cause the compiler:
				3216	* - do not extract the secret from sse registers in the internal loop
				3217	* - use less common registers, and avoid pushing these reg into stack
				3218	* The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
				3219	* customSecret, and on aarch64, this prevented LDP from merging two
				3220	* loads together for free. Putting the loads together before the stores
				3221	* properly generates LDP.
				3222	*/
				3223	__asm__("" : "+r" (dest));
				3224	# endif
				3225
				3226	/* GCC -O2 need unroll loop manually */
				3227	dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
				3228	dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
				3229	dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
				3230	dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
				3231	dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
				3232	dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
				3233	}
				3234	}
				3235
				3236	#endif
				3237
				3238	#if (XXH_VECTOR == XXH_SSE2) \|\| defined(XXH_X86DISPATCH)
				3239
				3240	#ifndef XXH_TARGET_SSE2
				3241	# define XXH_TARGET_SSE2 /* disable attribute target */
				3242	#endif
				3243
				3244	XXH_FORCE_INLINE XXH_TARGET_SSE2 void
				3245	XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
				3246	const void* XXH_RESTRICT input,
				3247	const void* XXH_RESTRICT secret)
				3248	{
				3249	/* SSE2 is just a half-scale version of the AVX2 version. */
				3250	XXH_ASSERT((((size_t)acc) & 15) == 0);
				3251	{ XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
				3252	/* Unaligned. This is mainly for pointer arithmetic, and because
				3253	* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
				3254	const __m128i* const xinput = (const __m128i *) input;
				3255	/* Unaligned. This is mainly for pointer arithmetic, and because
				3256	* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
				3257	const __m128i* const xsecret = (const __m128i *) secret;
				3258
				3259	size_t i;
				3260	for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
				3261	/* data_vec = xinput[i]; */
				3262	__m128i const data_vec = _mm_loadu_si128 (xinput+i);
				3263	/* key_vec = xsecret[i]; */
				3264	__m128i const key_vec = _mm_loadu_si128 (xsecret+i);
				3265	/* data_key = data_vec ^ key_vec; */
				3266	__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
				3267	/* data_key_lo = data_key >> 32; */
				3268	__m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
				3269	/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
				3270	__m128i const product = _mm_mul_epu32 (data_key, data_key_lo);
				3271	/* xacc[i] += swap(data_vec); */
				3272	__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
				3273	__m128i const sum = _mm_add_epi64(xacc[i], data_swap);
				3274	/* xacc[i] += product; */
				3275	xacc[i] = _mm_add_epi64(product, sum);
				3276	} }
				3277	}
				3278
				3279	XXH_FORCE_INLINE XXH_TARGET_SSE2 void
				3280	XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3281	{
				3282	XXH_ASSERT((((size_t)acc) & 15) == 0);
				3283	{ XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
				3284	/* Unaligned. This is mainly for pointer arithmetic, and because
				3285	* _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
				3286	const __m128i* const xsecret = (const __m128i *) secret;
				3287	const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
				3288
				3289	size_t i;
				3290	for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
				3291	/* xacc[i] ^= (xacc[i] >> 47) */
				3292	__m128i const acc_vec = xacc[i];
				3293	__m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
				3294	__m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
				3295	/* xacc[i] ^= xsecret[i]; */
				3296	__m128i const key_vec = _mm_loadu_si128 (xsecret+i);
				3297	__m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
				3298
				3299	/* xacc[i] = XXH_PRIME32_1; /
				3300	__m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
				3301	__m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
				3302	__m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
				3303	xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
				3304	}
				3305	}
				3306	}
				3307
				3308	XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
				3309	{
				3310	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
				3311	(void)(&XXH_writeLE64);
				3312	{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
				3313
				3314	# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
				3315	// MSVC 32bit mode does not support _mm_set_epi64x before 2015
				3316	XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 };
				3317	__m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
				3318	# else
				3319	__m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
				3320	# endif
				3321	int i;
				3322
				3323	XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret;
				3324	XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret;
				3325	# if defined(__GNUC__) \|\| defined(__clang__)
				3326	/*
				3327	* On GCC & Clang, marking 'dest' as modified will cause the compiler:
				3328	* - do not extract the secret from sse registers in the internal loop
				3329	* - use less common registers, and avoid pushing these reg into stack
				3330	*/
				3331	__asm__("" : "+r" (dest));
				3332	# endif
				3333
				3334	for (i=0; i < nbRounds; ++i) {
				3335	dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
				3336	} }
				3337	}
				3338
				3339	#endif
				3340
				3341	#if (XXH_VECTOR == XXH_NEON)
				3342
				3343	XXH_FORCE_INLINE void
				3344	XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
				3345	const void* XXH_RESTRICT input,
				3346	const void* XXH_RESTRICT secret)
				3347	{
				3348	XXH_ASSERT((((size_t)acc) & 15) == 0);
				3349	{
				3350	XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
				3351	/* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
				3352	uint8_t const* const xinput = (const uint8_t *) input;
				3353	uint8_t const* const xsecret = (const uint8_t *) secret;
				3354
				3355	size_t i;
				3356	for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
				3357	/* data_vec = xinput[i]; */
				3358	uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
				3359	/* key_vec = xsecret[i]; */
				3360	uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
				3361	uint64x2_t data_key;
				3362	uint32x2_t data_key_lo, data_key_hi;
				3363	/* xacc[i] += swap(data_vec); */
				3364	uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
				3365	uint64x2_t const swapped = vextq_u64(data64, data64, 1);
				3366	xacc[i] = vaddq_u64 (xacc[i], swapped);
				3367	/* data_key = data_vec ^ key_vec; */
				3368	data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
				3369	/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
				3370	* data_key_hi = (uint32x2_t) (data_key >> 32);
				3371	* data_key = UNDEFINED; */
				3372	XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
				3373	/* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
				3374	xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
				3375
				3376	}
				3377	}
				3378	}
				3379
				3380	XXH_FORCE_INLINE void
				3381	XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3382	{
				3383	XXH_ASSERT((((size_t)acc) & 15) == 0);
				3384
				3385	{ uint64x2_t* xacc = (uint64x2_t*) acc;
				3386	uint8_t const* xsecret = (uint8_t const*) secret;
				3387	uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
				3388
				3389	size_t i;
				3390	for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
				3391	/* xacc[i] ^= (xacc[i] >> 47); */
				3392	uint64x2_t acc_vec = xacc[i];
				3393	uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
				3394	uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
				3395
				3396	/* xacc[i] ^= xsecret[i]; */
				3397	uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
				3398	uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
				3399
				3400	/* xacc[i] = XXH_PRIME32_1 /
				3401	uint32x2_t data_key_lo, data_key_hi;
				3402	/* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
				3403	* data_key_hi = (uint32x2_t) (xacc[i] >> 32);
				3404	* xacc[i] = UNDEFINED; */
				3405	XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
				3406	{ /*
				3407	* prod_hi = (data_key >> 32) * XXH_PRIME32_1;
				3408	*
				3409	* Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
				3410	* incorrectly "optimize" this:
				3411	* tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
				3412	* shifted = vshll_n_u32(tmp, 32);
				3413	* to this:
				3414	* tmp = "vmulq_u64"(a, b); // no such thing!
				3415	* shifted = vshlq_n_u64(tmp, 32);
				3416	*
				3417	* However, unlike SSE, Clang lacks a 64-bit multiply routine
				3418	* for NEON, and it scalarizes two 64-bit multiplies instead.
				3419	*
				3420	* vmull_u32 has the same timing as vmul_u32, and it avoids
				3421	* this bug completely.
				3422	* See https://bugs.llvm.org/show_bug.cgi?id=39967
				3423	*/
				3424	uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
				3425	/* xacc[i] = prod_hi << 32; */
				3426	xacc[i] = vshlq_n_u64(prod_hi, 32);
				3427	/* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
				3428	xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
				3429	}
				3430	} }
				3431	}
				3432
				3433	#endif
				3434
				3435	#if (XXH_VECTOR == XXH_VSX)
				3436
				3437	XXH_FORCE_INLINE void
				3438	XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
				3439	const void* XXH_RESTRICT input,
				3440	const void* XXH_RESTRICT secret)
				3441	{
				3442	xxh_u64x2* const xacc = (xxh_u64x2) acc; / presumed aligned */
				3443	xxh_u64x2 const* const xinput = (xxh_u64x2 const) input; / no alignment restriction */
				3444	xxh_u64x2 const* const xsecret = (xxh_u64x2 const) secret; / no alignment restriction */
				3445	xxh_u64x2 const v32 = { 32, 32 };
				3446	size_t i;
				3447	for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
				3448	/* data_vec = xinput[i]; */
				3449	xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
				3450	/* key_vec = xsecret[i]; */
				3451	xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
				3452	xxh_u64x2 const data_key = data_vec ^ key_vec;
				3453	/* shuffled = (data_key << 32) \| (data_key >> 32); */
				3454	xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
				3455	/* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
				3456	xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
				3457	xacc[i] += product;
				3458
				3459	/* swap high and low halves */
				3460	#ifdef __s390x__
				3461	xacc[i] += vec_permi(data_vec, data_vec, 2);
				3462	#else
				3463	xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
				3464	#endif
				3465	}
				3466	}
				3467
				3468	XXH_FORCE_INLINE void
				3469	XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3470	{
				3471	XXH_ASSERT((((size_t)acc) & 15) == 0);
				3472
				3473	{ xxh_u64x2* const xacc = (xxh_u64x2*) acc;
				3474	const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
				3475	/* constants */
				3476	xxh_u64x2 const v32 = { 32, 32 };
				3477	xxh_u64x2 const v47 = { 47, 47 };
				3478	xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
				3479	size_t i;
				3480	for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
				3481	/* xacc[i] ^= (xacc[i] >> 47); */
				3482	xxh_u64x2 const acc_vec = xacc[i];
				3483	xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
				3484
				3485	/* xacc[i] ^= xsecret[i]; */
				3486	xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
				3487	xxh_u64x2 const data_key = data_vec ^ key_vec;
				3488
				3489	/* xacc[i] = XXH_PRIME32_1 /
				3490	/* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
				3491	xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
				3492	/* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
				3493	xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
				3494	xacc[i] = prod_odd + (prod_even << v32);
				3495	} }
				3496	}
				3497
				3498	#endif
				3499
				3500	/* scalar variants - universal */
				3501
				3502	XXH_FORCE_INLINE void
				3503	XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
				3504	const void* XXH_RESTRICT input,
				3505	const void* XXH_RESTRICT secret)
				3506	{
				3507	XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64) acc; / presumed aligned */
				3508	const xxh_u8* const xinput = (const xxh_u8) input; / no alignment restriction */
				3509	const xxh_u8* const xsecret = (const xxh_u8) secret; / no alignment restriction */
				3510	size_t i;
				3511	XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
				3512	for (i=0; i < XXH_ACC_NB; i++) {
				3513	xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
				3514	xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
				3515	xacc[i ^ 1] += data_val; /* swap adjacent lanes */
				3516	xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
				3517	}
				3518	}
				3519
				3520	XXH_FORCE_INLINE void
				3521	XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
				3522	{
				3523	XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64) acc; / presumed aligned */
				3524	const xxh_u8* const xsecret = (const xxh_u8) secret; / no alignment restriction */
				3525	size_t i;
				3526	XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
				3527	for (i=0; i < XXH_ACC_NB; i++) {
				3528	xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
				3529	xxh_u64 acc64 = xacc[i];
				3530	acc64 = XXH_xorshift64(acc64, 47);
				3531	acc64 ^= key64;
				3532	acc64 *= XXH_PRIME32_1;
				3533	xacc[i] = acc64;
				3534	}
				3535	}
				3536
				3537	XXH_FORCE_INLINE void
				3538	XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
				3539	{
				3540	/*
				3541	* We need a separate pointer for the hack below,
				3542	* which requires a non-const pointer.
				3543	* Any decent compiler will optimize this out otherwise.
				3544	*/
				3545	const xxh_u8* kSecretPtr = XXH3_kSecret;
				3546	XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
				3547
				3548	#if defined(__clang__) && defined(__aarch64__)
				3549	/*
				3550	* UGLY HACK:
				3551	* Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
				3552	* placed sequentially, in order, at the top of the unrolled loop.
				3553	*
				3554	* While MOVK is great for generating constants (2 cycles for a 64-bit
				3555	* constant compared to 4 cycles for LDR), long MOVK chains stall the
				3556	* integer pipelines:
				3557	* I L S
				3558	* MOVK
				3559	* MOVK
				3560	* MOVK
				3561	* MOVK
				3562	* ADD
				3563	* SUB STR
				3564	* STR
				3565	* By forcing loads from memory (as the asm line causes Clang to assume
				3566	* that XXH3_kSecretPtr has been changed), the pipelines are used more
				3567	* efficiently:
				3568	* I L S
				3569	* LDR
				3570	* ADD LDR
				3571	* SUB STR
				3572	* STR
				3573	* XXH3_64bits_withSeed, len == 256, Snapdragon 835
				3574	* without hack: 2654.4 MB/s
				3575	* with hack: 3202.9 MB/s
				3576	*/
				3577	__asm__("" : "+r" (kSecretPtr));
				3578	#endif
				3579	/*
				3580	* Note: in debug mode, this overrides the asm optimization
				3581	* and Clang will emit MOVK chains again.
				3582	*/
				3583	XXH_ASSERT(kSecretPtr == XXH3_kSecret);
				3584
				3585	{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
				3586	int i;
				3587	for (i=0; i < nbRounds; i++) {
				3588	/*
				3589	* The asm hack causes Clang to assume that kSecretPtr aliases with
				3590	* customSecret, and on aarch64, this prevented LDP from merging two
				3591	* loads together for free. Putting the loads together before the stores
				3592	* properly generates LDP.
				3593	*/
				3594	xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;
				3595	xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
				3596	XXH_writeLE64((xxh_u8)customSecret + 16i, lo);
				3597	XXH_writeLE64((xxh_u8)customSecret + 16i + 8, hi);
				3598	} }
				3599	}
				3600
				3601
				3602	typedef void (XXH3_f_accumulate_512)(void XXH_RESTRICT, const void, const void);
				3603	typedef void (XXH3_f_scrambleAcc)(void XXH_RESTRICT, const void*);
				3604	typedef void (XXH3_f_initCustomSecret)(void XXH_RESTRICT, xxh_u64);
				3605
				3606
				/*
				 * Backend selection: binds the generic names XXH3_accumulate_512,
				 * XXH3_scrambleAcc and XXH3_initCustomSecret to the variant matching
				 * XXH_VECTOR. The NEON and VSX configurations reuse the scalar
				 * XXH3_initCustomSecret; any other value selects the scalar variants.
				 */
				#if (XXH_VECTOR == XXH_AVX512)

				#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
				#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
				#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512

				#elif (XXH_VECTOR == XXH_AVX2)

				#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
				#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
				#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2

				#elif (XXH_VECTOR == XXH_SSE2)

				#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
				#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
				#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2

				#elif (XXH_VECTOR == XXH_NEON)

				#define XXH3_accumulate_512 XXH3_accumulate_512_neon
				#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
				#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

				#elif (XXH_VECTOR == XXH_VSX)

				#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
				#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
				#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

				#else /* scalar */

				#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
				#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
				#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

				#endif
				3644
				3645
				3646
				/*
				 * XXH_PREFETCH_DIST: byte offset ahead of the current stripe that
				 * XXH3_accumulate() passes to XXH_PREFETCH. It can be predefined by the
				 * build; otherwise the default is 320 with Clang, and for other compilers
				 * 512 when AVX512 is the selected backend or 384 in every other case.
				 */
				#ifndef XXH_PREFETCH_DIST
				# ifdef __clang__
				# define XXH_PREFETCH_DIST 320
				# else
				# if (XXH_VECTOR == XXH_AVX512)
				# define XXH_PREFETCH_DIST 512
				# else
				# define XXH_PREFETCH_DIST 384
				# endif
				# endif /* __clang__ */
				#endif /* XXH_PREFETCH_DIST */
				3658
				3659	/*
				3660	* XXH3_accumulate()
				3661	* Loops over XXH3_accumulate_512().
				3662	* Assumption: nbStripes will not overflow the secret size
				3663	*/
				3664	XXH_FORCE_INLINE void
				3665	XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
				3666	const xxh_u8* XXH_RESTRICT input,
				3667	const xxh_u8* XXH_RESTRICT secret,
				3668	size_t nbStripes,
				3669	XXH3_f_accumulate_512 f_acc512)
				3670	{
				3671	size_t n;
				3672	for (n = 0; n < nbStripes; n++ ) {
				3673	const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
				3674	XXH_PREFETCH(in + XXH_PREFETCH_DIST);
				3675	f_acc512(acc,
				3676	in,
				3677	secret + n*XXH_SECRET_CONSUME_RATE);
				3678	}
				3679	}
				3680
				3681	XXH_FORCE_INLINE void
				3682	XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
				3683	const xxh_u8* XXH_RESTRICT input, size_t len,
				3684	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				3685	XXH3_f_accumulate_512 f_acc512,
				3686	XXH3_f_scrambleAcc f_scramble)
				3687	{
				3688	size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
				3689	size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
				3690	size_t const nb_blocks = (len - 1) / block_len;
				3691
				3692	size_t n;
				3693
				3694	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
				3695
				3696	for (n = 0; n < nb_blocks; n++) {
				3697	XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
				3698	f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
				3699	}
				3700
				3701	/* last partial block */
				3702	XXH_ASSERT(len > XXH_STRIPE_LEN);
				3703	{ size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
				3704	XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
				3705	XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
				3706
				3707	/* last stripe */
				3708	{ const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
				3709	#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
				3710	f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
				3711	} }
				3712	}
				3713
				3714	XXH_FORCE_INLINE xxh_u64
				3715	XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
				3716	{
				3717	return XXH3_mul128_fold64(
				3718	acc[0] ^ XXH_readLE64(secret),
				3719	acc[1] ^ XXH_readLE64(secret+8) );
				3720	}
				3721
				3722	static XXH64_hash_t
				3723	XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
				3724	{
				3725	xxh_u64 result64 = start;
				3726	size_t i = 0;
				3727
				3728	for (i = 0; i < 4; i++) {
				3729	result64 += XXH3_mix2Accs(acc+2i, secret + 16i);
				3730	#if defined(__clang__) /* Clang */ \
				3731	&& (defined(__arm__) \|\| defined(__thumb__)) /* ARMv7 */ \
				3732	&& (defined(__ARM_NEON) \|\| defined(__ARM_NEON__)) /* NEON */ \
				3733	&& !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
				3734	/*
				3735	* UGLY HACK:
				3736	* Prevent autovectorization on Clang ARMv7-a. Exact same problem as
				3737	* the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
				3738	* XXH3_64bits, len == 256, Snapdragon 835:
				3739	* without hack: 2063.7 MB/s
				3740	* with hack: 2560.7 MB/s
				3741	*/
				3742	__asm__("" : "+r" (result64));
				3743	#endif
				3744	}
				3745
				3746	return XXH3_avalanche(result64);
				3747	}
				3748
				3749	#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
				3750	XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
				3751
				3752	XXH_FORCE_INLINE XXH64_hash_t
				3753	XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
				3754	const void* XXH_RESTRICT secret, size_t secretSize,
				3755	XXH3_f_accumulate_512 f_acc512,
				3756	XXH3_f_scrambleAcc f_scramble)
				3757	{
				3758	XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
				3759
				3760	XXH3_hashLong_internal_loop(acc, (const xxh_u8)input, len, (const xxh_u8)secret, secretSize, f_acc512, f_scramble);
				3761
				3762	/* converge into final hash */
				3763	XXH_STATIC_ASSERT(sizeof(acc) == 64);
				3764	/* do not align on 8, so that the secret is different from the accumulator */
				3765	#define XXH_SECRET_MERGEACCS_START 11
				3766	XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
				3767	return XXH3_mergeAccs(acc, (const xxh_u8)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len XXH_PRIME64_1);
				3768	}
				3769
				3770	/*
				3771	* It's important for performance that XXH3_hashLong is not inlined.
				3772	*/
				3773	XXH_NO_INLINE XXH64_hash_t
				3774	XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
				3775	XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
				3776	{
				3777	(void)seed64;
				3778	return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
				3779	}
				3780
				3781	/*
				3782	* It's important for performance that XXH3_hashLong is not inlined.
				3783	* Since the function is not inlined, the compiler may not be able to understand that,
				3784	* in some scenarios, its `secret` argument is actually a compile time constant.
				3785	* This variant enforces that the compiler can detect that,
				3786	* and uses this opportunity to streamline the generated code for better performance.
				3787	*/
				3788	XXH_NO_INLINE XXH64_hash_t
				3789	XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
				3790	XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
				3791	{
				3792	(void)seed64; (void)secret; (void)secretLen;
				3793	return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
				3794	}
				3795
				3796	/*
				3797	* XXH3_hashLong_64b_withSeed():
				3798	* Generate a custom key based on alteration of default XXH3_kSecret with the seed,
				3799	* and then use this key for long mode hashing.
				3800	*
				3801	* This operation is decently fast but nonetheless costs a little bit of time.
				3802	* Try to avoid it whenever possible (typically when seed==0).
				3803	*
				3804	* It's important for performance that XXH3_hashLong is not inlined. Not sure
				3805	* why (uop cache maybe?), but the difference is large and easily measurable.
				3806	*/
				3807	XXH_FORCE_INLINE XXH64_hash_t
				3808	XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
				3809	XXH64_hash_t seed,
				3810	XXH3_f_accumulate_512 f_acc512,
				3811	XXH3_f_scrambleAcc f_scramble,
				3812	XXH3_f_initCustomSecret f_initSec)
				3813	{
				3814	if (seed == 0)
				3815	return XXH3_hashLong_64b_internal(input, len,
				3816	XXH3_kSecret, sizeof(XXH3_kSecret),
				3817	f_acc512, f_scramble);
				3818	{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
				3819	f_initSec(secret, seed);
				3820	return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
				3821	f_acc512, f_scramble);
				3822	}
				3823	}
				3824
				3825	/*
				3826	* It's important for performance that XXH3_hashLong is not inlined.
				3827	*/
				3828	XXH_NO_INLINE XXH64_hash_t
				3829	XXH3_hashLong_64b_withSeed(const void* input, size_t len,
				3830	XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
				3831	{
				3832	(void)secret; (void)secretLen;
				3833	return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
				3834	XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
				3835	}
				3836
				3837
				3838	typedef XXH64_hash_t (XXH3_hashLong64_f)(const void XXH_RESTRICT, size_t,
				3839	XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
				3840
				3841	XXH_FORCE_INLINE XXH64_hash_t
				3842	XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
				3843	XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
				3844	XXH3_hashLong64_f f_hashLong)
				3845	{
				3846	XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
				3847	/*
				3848	* If an action is to be taken if `secretLen` condition is not respected,
				3849	* it should be done here.
				3850	* For now, it's a contract pre-condition.
				3851	* Adding a check and a branch here would cost performance at every hash.
				3852	* Also, note that function signature doesn't offer room to return an error.
				3853	*/
				3854	if (len <= 16)
				3855	return XXH3_len_0to16_64b((const xxh_u8)input, len, (const xxh_u8)secret, seed64);
				3856	if (len <= 128)
				3857	return XXH3_len_17to128_64b((const xxh_u8)input, len, (const xxh_u8)secret, secretLen, seed64);
				3858	if (len <= XXH3_MIDSIZE_MAX)
				3859	return XXH3_len_129to240_64b((const xxh_u8)input, len, (const xxh_u8)secret, secretLen, seed64);
				3860	return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
				3861	}
				3862
				3863
				3864	/* === Public entry point === */
				3865
				3866	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
				3867	{
				3868	return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
				3869	}
				3870
				3871	XXH_PUBLIC_API XXH64_hash_t
				3872	XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
				3873	{
				3874	return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
				3875	}
				3876
				3877	XXH_PUBLIC_API XXH64_hash_t
				3878	XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
				3879	{
				3880	return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
				3881	}
				3882
				3883
				3884	/* === XXH3 streaming === */
				3885
				3886	/*
				3887	* Malloc's a pointer that is always aligned to align.
				3888	*
				3889	* This must be freed with `XXH_alignedFree()`.
				3890	*
				3891	* malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
				3892	* alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
				3893	* or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
				3894	*
				3895	* This underalignment previously caused a rather obvious crash which went
				3896	* completely unnoticed due to XXH3_createState() not actually being tested.
				3897	* Credit to RedSpah for noticing this bug.
				3898	*
				3899	* The alignment is done manually: Functions like posix_memalign or _mm_malloc
				3900	* are avoided: To maintain portability, we would have to write a fallback
				3901	* like this anyways, and besides, testing for the existence of library
				3902	* functions without relying on external build tools is impossible.
				3903	*
				3904	* The method is simple: Overallocate, manually align, and store the offset
				3905	* to the original behind the returned pointer.
				3906	*
				3907	* Align must be a power of 2 and 8 <= align <= 128.
				3908	*/
				3909	static void* XXH_alignedMalloc(size_t s, size_t align)
				3910	{
				3911	XXH_ASSERT(align <= 128 && align >= 8); /* range check */
				3912	XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */
				3913	XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */
				3914	{ /* Overallocate to make room for manual realignment and an offset byte */
				3915	xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
				3916	if (base != NULL) {
				3917	/*
				3918	* Get the offset needed to align this pointer.
				3919	*
				3920	* Even if the returned pointer is aligned, there will always be
				3921	* at least one byte to store the offset to the original pointer.
				3922	*/
				3923	size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
				3924	/* Add the offset for the now-aligned pointer */
				3925	xxh_u8* ptr = base + offset;
				3926
				3927	XXH_ASSERT((size_t)ptr % align == 0);
				3928
				3929	/* Store the offset immediately before the returned pointer. */
				3930	ptr[-1] = (xxh_u8)offset;
				3931	return ptr;
				3932	}
				3933	return NULL;
				3934	}
				3935	}
				3936	/*
				3937	* Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
				3938	* normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
				3939	*/
				3940	static void XXH_alignedFree(void* p)
				3941	{
				3942	if (p != NULL) {
				3943	xxh_u8* ptr = (xxh_u8*)p;
				3944	/* Get the offset byte we added in XXH_malloc. */
				3945	xxh_u8 offset = ptr[-1];
				3946	/* Free the original malloc'd pointer */
				3947	xxh_u8* base = ptr - offset;
				3948	XXH_free(base);
				3949	}
				3950	}
				3951	XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
				3952	{
				3953	XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
				3954	if (state==NULL) return NULL;
				3955	XXH3_INITSTATE(state);
				3956	return state;
				3957	}
				3958
				3959	XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
				3960	{
				3961	XXH_alignedFree(statePtr);
				3962	return XXH_OK;
				3963	}
				3964
				3965	XXH_PUBLIC_API void
				3966	XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
				3967	{
				3968	memcpy(dst_state, src_state, sizeof(*dst_state));
				3969	}
				3970
				3971	static void
				3972	XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
				3973	XXH64_hash_t seed,
				3974	const void* secret, size_t secretSize)
				3975	{
				3976	size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
				3977	size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
				3978	XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
				3979	XXH_ASSERT(statePtr != NULL);
				3980	/* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
				3981	memset((char*)statePtr + initStart, 0, initLength);
				3982	statePtr->acc[0] = XXH_PRIME32_3;
				3983	statePtr->acc[1] = XXH_PRIME64_1;
				3984	statePtr->acc[2] = XXH_PRIME64_2;
				3985	statePtr->acc[3] = XXH_PRIME64_3;
				3986	statePtr->acc[4] = XXH_PRIME64_4;
				3987	statePtr->acc[5] = XXH_PRIME32_2;
				3988	statePtr->acc[6] = XXH_PRIME64_5;
				3989	statePtr->acc[7] = XXH_PRIME32_1;
				3990	statePtr->seed = seed;
				3991	statePtr->extSecret = (const unsigned char*)secret;
				3992	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
				3993	statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
				3994	statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
				3995	}
				3996
				3997	XXH_PUBLIC_API XXH_errorcode
				3998	XXH3_64bits_reset(XXH3_state_t* statePtr)
				3999	{
				4000	if (statePtr == NULL) return XXH_ERROR;
				4001	XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
				4002	return XXH_OK;
				4003	}
				4004
				4005	XXH_PUBLIC_API XXH_errorcode
				4006	XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
				4007	{
				4008	if (statePtr == NULL) return XXH_ERROR;
				4009	XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
				4010	if (secret == NULL) return XXH_ERROR;
				4011	if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
				4012	return XXH_OK;
				4013	}
				4014
				4015	XXH_PUBLIC_API XXH_errorcode
				4016	XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
				4017	{
				4018	if (statePtr == NULL) return XXH_ERROR;
				4019	if (seed==0) return XXH3_64bits_reset(statePtr);
				4020	if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
				4021	XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
				4022	return XXH_OK;
				4023	}
				4024
				4025	/* Note : when XXH3_consumeStripes() is invoked,
				4026	* there must be a guarantee that at least one more byte must be consumed from input
				4027	* so that the function can blindly consume all stripes using the "normal" secret segment */
				4028	XXH_FORCE_INLINE void
				4029	XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
				4030	size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
				4031	const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
				4032	const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
				4033	XXH3_f_accumulate_512 f_acc512,
				4034	XXH3_f_scrambleAcc f_scramble)
				4035	{
				4036	XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
				4037	XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
				4038	if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
				4039	/* need a scrambling operation */
				4040	size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
				4041	size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
				4042	XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
				4043	f_scramble(acc, secret + secretLimit);
				4044	XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
				4045	*nbStripesSoFarPtr = nbStripesAfterBlock;
				4046	} else {
				4047	XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
				4048	*nbStripesSoFarPtr += nbStripes;
				4049	}
				4050	}
				4051
				4052	/*
				4053	* Both XXH3_64bits_update and XXH3_128bits_update use this routine.
				4054	*/
				4055	XXH_FORCE_INLINE XXH_errorcode
				4056	XXH3_update(XXH3_state_t* state,
				4057	const xxh_u8* input, size_t len,
				4058	XXH3_f_accumulate_512 f_acc512,
				4059	XXH3_f_scrambleAcc f_scramble)
				4060	{
				4061	if (input==NULL)
				4062	#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
				4063	return XXH_OK;
				4064	#else
				4065	return XXH_ERROR;
				4066	#endif
				4067
				4068	{ const xxh_u8* const bEnd = input + len;
				4069	const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
				4070
				4071	state->totalLen += len;
				4072
				4073	if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
				4074	XXH_memcpy(state->buffer + state->bufferedSize, input, len);
				4075	state->bufferedSize += (XXH32_hash_t)len;
				4076	return XXH_OK;
				4077	}
				4078	/* total input is now > XXH3_INTERNALBUFFER_SIZE */
				4079
				4080	#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
				4081	XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
				4082
				4083	/*
				4084	* Internal buffer is partially filled (always, except at beginning)
				4085	* Complete it, then consume it.
				4086	*/
				4087	if (state->bufferedSize) {
				4088	size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
				4089	XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
				4090	input += loadSize;
				4091	XXH3_consumeStripes(state->acc,
				4092	&state->nbStripesSoFar, state->nbStripesPerBlock,
				4093	state->buffer, XXH3_INTERNALBUFFER_STRIPES,
				4094	secret, state->secretLimit,
				4095	f_acc512, f_scramble);
				4096	state->bufferedSize = 0;
				4097	}
				4098	XXH_ASSERT(input < bEnd);
				4099
				4100	/* Consume input by a multiple of internal buffer size */
				4101	if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
				4102	const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
				4103	do {
				4104	XXH3_consumeStripes(state->acc,
				4105	&state->nbStripesSoFar, state->nbStripesPerBlock,
				4106	input, XXH3_INTERNALBUFFER_STRIPES,
				4107	secret, state->secretLimit,
				4108	f_acc512, f_scramble);
				4109	input += XXH3_INTERNALBUFFER_SIZE;
				4110	} while (input<limit);
				4111	/* for last partial stripe */
				4112	memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
				4113	}
				4114	XXH_ASSERT(input < bEnd);
				4115
				4116	/* Some remaining input (always) : buffer it */
				4117	XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
				4118	state->bufferedSize = (XXH32_hash_t)(bEnd-input);
				4119	}
				4120
				4121	return XXH_OK;
				4122	}
				4123
				4124	XXH_PUBLIC_API XXH_errorcode
				4125	XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
				4126	{
				4127	return XXH3_update(state, (const xxh_u8*)input, len,
				4128	XXH3_accumulate_512, XXH3_scrambleAcc);
				4129	}
				4130
				4131
				4132	XXH_FORCE_INLINE void
				4133	XXH3_digest_long (XXH64_hash_t* acc,
				4134	const XXH3_state_t* state,
				4135	const unsigned char* secret)
				4136	{
				4137	/*
				4138	* Digest on a local copy. This way, the state remains unaltered, and it can
				4139	* continue ingesting more input afterwards.
				4140	*/
				4141	memcpy(acc, state->acc, sizeof(state->acc));
				4142	if (state->bufferedSize >= XXH_STRIPE_LEN) {
				4143	size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
				4144	size_t nbStripesSoFar = state->nbStripesSoFar;
				4145	XXH3_consumeStripes(acc,
				4146	&nbStripesSoFar, state->nbStripesPerBlock,
				4147	state->buffer, nbStripes,
				4148	secret, state->secretLimit,
				4149	XXH3_accumulate_512, XXH3_scrambleAcc);
				4150	/* last stripe */
				4151	XXH3_accumulate_512(acc,
				4152	state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
				4153	secret + state->secretLimit - XXH_SECRET_LASTACC_START);
				4154	} else { /* bufferedSize < XXH_STRIPE_LEN */
				4155	xxh_u8 lastStripe[XXH_STRIPE_LEN];
				4156	size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
				4157	XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
				4158	memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
				4159	memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
				4160	XXH3_accumulate_512(acc,
				4161	lastStripe,
				4162	secret + state->secretLimit - XXH_SECRET_LASTACC_START);
				4163	}
				4164	}
				4165
				4166	XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
				4167	{
				4168	const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
				4169	if (state->totalLen > XXH3_MIDSIZE_MAX) {
				4170	XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
				4171	XXH3_digest_long(acc, state, secret);
				4172	return XXH3_mergeAccs(acc,
				4173	secret + XXH_SECRET_MERGEACCS_START,
				4174	(xxh_u64)state->totalLen * XXH_PRIME64_1);
				4175	}
				4176	/* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
				4177	if (state->seed)
				4178	return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
				4179	return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
				4180	secret, state->secretLimit + XXH_STRIPE_LEN);
				4181	}
				4182
				4183
				4184	#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
				4185
				4186	XXH_PUBLIC_API void
				4187	XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
				4188	{
				4189	XXH_ASSERT(secretBuffer != NULL);
				4190	if (customSeedSize == 0) {
				4191	memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
				4192	return;
				4193	}
				4194	XXH_ASSERT(customSeed != NULL);
				4195
				4196	{ size_t const segmentSize = sizeof(XXH128_hash_t);
				4197	size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
				4198	XXH128_canonical_t scrambler;
				4199	XXH64_hash_t seeds[12];
				4200	size_t segnb;
				4201	XXH_ASSERT(nbSegments == 12);
				4202	XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
				4203	XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
				4204
				4205	/*
				4206	* Copy customSeed to seeds[], truncating or repeating as necessary.
				4207	*/
				4208	{ size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
				4209	size_t filled = toFill;
				4210	memcpy(seeds, customSeed, toFill);
				4211	while (filled < sizeof(seeds)) {
				4212	toFill = XXH_MIN(filled, sizeof(seeds) - filled);
				4213	memcpy((char*)seeds + filled, seeds, toFill);
				4214	filled += toFill;
				4215	} }
				4216
				4217	/* generate secret */
				4218	memcpy(secretBuffer, &scrambler, sizeof(scrambler));
				4219	for (segnb=1; segnb < nbSegments; segnb++) {
				4220	size_t const segmentStart = segnb * segmentSize;
				4221	XXH128_canonical_t segment;
				4222	XXH128_canonicalFromHash(&segment,
				4223	XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
				4224	memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
				4225	} }
				4226	}
				4227
				4228
				4229	/* ==========================================
				4230	* XXH3 128 bits (a.k.a XXH128)
				4231	* ==========================================
				4232	* XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
				4233	* even without counting the significantly larger output size.
				4234	*
				4235	* For example, extra steps are taken to avoid the seed-dependent collisions
				4236	* in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
				4237	*
				4238	* This strength naturally comes at the cost of some speed, especially on short
				4239	* lengths. Note that longer hashes are about as fast as the 64-bit version
				4240	* due to it using only a slight modification of the 64-bit loop.
				4241	*
				4242	* XXH128 is also more oriented towards 64-bit machines. It is still extremely
				4243	* fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
				4244	*/
				4245
				4246	XXH_FORCE_INLINE XXH128_hash_t
				4247	XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				4248	{
				4249	/* A doubled version of 1to3_64b with different constants. */
				4250	XXH_ASSERT(input != NULL);
				4251	XXH_ASSERT(1 <= len && len <= 3);
				4252	XXH_ASSERT(secret != NULL);
				4253	/*
				4254	* len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
				4255	* len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
				4256	* len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
				4257	*/
				4258	{ xxh_u8 const c1 = input[0];
				4259	xxh_u8 const c2 = input[len >> 1];
				4260	xxh_u8 const c3 = input[len - 1];
				4261	xxh_u32 const combinedl = ((xxh_u32)c1 <<16) \| ((xxh_u32)c2 << 24)
				4262	\| ((xxh_u32)c3 << 0) \| ((xxh_u32)len << 8);
				4263	xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
				4264	xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
				4265	xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
				4266	xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
				4267	xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
				4268	XXH128_hash_t h128;
				4269	h128.low64 = XXH64_avalanche(keyed_lo);
				4270	h128.high64 = XXH64_avalanche(keyed_hi);
				4271	return h128;
				4272	}
				4273	}
				4274
				4275	XXH_FORCE_INLINE XXH128_hash_t
				4276	XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				4277	{
				4278	XXH_ASSERT(input != NULL);
				4279	XXH_ASSERT(secret != NULL);
				4280	XXH_ASSERT(4 <= len && len <= 8);
				4281	seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
				4282	{ xxh_u32 const input_lo = XXH_readLE32(input);
				4283	xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
				4284	xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
				4285	xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
				4286	xxh_u64 const keyed = input_64 ^ bitflip;
				4287
				4288	/* Shift len to the left to ensure it is even, this avoids even multiplies. */
				4289	XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
				4290
				4291	m128.high64 += (m128.low64 << 1);
				4292	m128.low64 ^= (m128.high64 >> 3);
				4293
				4294	m128.low64 = XXH_xorshift64(m128.low64, 35);
				4295	m128.low64 *= 0x9FB21C651E98DF25ULL;
				4296	m128.low64 = XXH_xorshift64(m128.low64, 28);
				4297	m128.high64 = XXH3_avalanche(m128.high64);
				4298	return m128;
				4299	}
				4300	}
				4301
				4302	XXH_FORCE_INLINE XXH128_hash_t
				4303	XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				4304	{
				4305	XXH_ASSERT(input != NULL);
				4306	XXH_ASSERT(secret != NULL);
				4307	XXH_ASSERT(9 <= len && len <= 16);
				4308	{ xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
				4309	xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
				4310	xxh_u64 const input_lo = XXH_readLE64(input);
				4311	xxh_u64 input_hi = XXH_readLE64(input + len - 8);
				4312	XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
				4313	/*
				4314	* Put len in the middle of m128 to ensure that the length gets mixed to
				4315	* both the low and high bits in the 128x64 multiply below.
				4316	*/
				4317	m128.low64 += (xxh_u64)(len - 1) << 54;
				4318	input_hi ^= bitfliph;
				4319	/*
				4320	* Add the high 32 bits of input_hi to the high 32 bits of m128, then
				4321	* add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
				4322	* the high 64 bits of m128.
				4323	*
				4324	* The best approach to this operation is different on 32-bit and 64-bit.
				4325	*/
				4326	if (sizeof(void ) < sizeof(xxh_u64)) { / 32-bit */
				4327	/*
				4328	* 32-bit optimized version, which is more readable.
				4329	*
				4330	* On 32-bit, it removes an ADC and delays a dependency between the two
				4331	* halves of m128.high64, but it generates an extra mask on 64-bit.
				4332	*/
				4333	m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
				4334	} else {
				4335	/*
				4336	* 64-bit optimized (albeit more confusing) version.
				4337	*
				4338	* Uses some properties of addition and multiplication to remove the mask:
				4339	*
				4340	* Let:
				4341	* a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
				4342	* b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
				4343	* c = XXH_PRIME32_2
				4344	*
				4345	* a + (b * c)
				4346	* Inverse Property: x + y - x == y
				4347	* a + (b * (1 + c - 1))
				4348	* Distributive Property: x * (y + z) == (x * y) + (x * z)
				4349	* a + (b * 1) + (b * (c - 1))
				4350	* Identity Property: x * 1 == x
				4351	* a + b + (b * (c - 1))
				4352	*
				4353	* Substitute a, b, and c:
				4354	* input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
				4355	*
				4356	* Since input_hi.hi + input_hi.lo == input_hi, we get this:
				4357	* input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
				4358	*/
				4359	m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
				4360	}
				4361	/* m128 ^= XXH_swap64(m128 >> 64); */
				4362	m128.low64 ^= XXH_swap64(m128.high64);
				4363
				4364	{ /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
				4365	XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
				4366	h128.high64 += m128.high64 * XXH_PRIME64_2;
				4367
				4368	h128.low64 = XXH3_avalanche(h128.low64);
				4369	h128.high64 = XXH3_avalanche(h128.high64);
				4370	return h128;
				4371	} }
				4372	}
				4373
				4374	/*
				4375	* Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
				4376	*/
				4377	XXH_FORCE_INLINE XXH128_hash_t
				4378	XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
				4379	{
				4380	XXH_ASSERT(len <= 16);
				4381	{ if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
				4382	if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
				4383	if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
				4384	{ XXH128_hash_t h128;
				4385	xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
				4386	xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
				4387	h128.low64 = XXH64_avalanche(seed ^ bitflipl);
				4388	h128.high64 = XXH64_avalanche( seed ^ bitfliph);
				4389	return h128;
				4390	} }
				4391	}
				4392
				4393	/*
				4394	* A bit slower than XXH3_mix16B, but handles multiply by zero better.
				4395	*/
				4396	XXH_FORCE_INLINE XXH128_hash_t
				4397	XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
				4398	const xxh_u8* secret, XXH64_hash_t seed)
				4399	{
				4400	acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
				4401	acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
				4402	acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
				4403	acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
				4404	return acc;
				4405	}
				4406
				4407
				4408	XXH_FORCE_INLINE XXH128_hash_t
				4409	XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
				4410	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				4411	XXH64_hash_t seed)
				4412	{
				4413	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
				4414	XXH_ASSERT(16 < len && len <= 128);
				4415
				4416	{ XXH128_hash_t acc;
				4417	acc.low64 = len * XXH_PRIME64_1;
				4418	acc.high64 = 0;
				4419	if (len > 32) {
				4420	if (len > 64) {
				4421	if (len > 96) {
				4422	acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
				4423	}
				4424	acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
				4425	}
				4426	acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
				4427	}
				4428	acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
				4429	{ XXH128_hash_t h128;
				4430	h128.low64 = acc.low64 + acc.high64;
				4431	h128.high64 = (acc.low64 * XXH_PRIME64_1)
				4432	+ (acc.high64 * XXH_PRIME64_4)
				4433	+ ((len - seed) * XXH_PRIME64_2);
				4434	h128.low64 = XXH3_avalanche(h128.low64);
				4435	h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
				4436	return h128;
				4437	}
				4438	}
				4439	}
				4440
				4441	XXH_NO_INLINE XXH128_hash_t
				4442	XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
				4443	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				4444	XXH64_hash_t seed)
				4445	{
				4446	XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
				4447	XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
				4448
				4449	{ XXH128_hash_t acc;
				4450	int const nbRounds = (int)len / 32;
				4451	int i;
				4452	acc.low64 = len * XXH_PRIME64_1;
				4453	acc.high64 = 0;
				4454	for (i=0; i<4; i++) {
				4455	acc = XXH128_mix32B(acc,
				4456	input + (32 * i),
				4457	input + (32 * i) + 16,
				4458	secret + (32 * i),
				4459	seed);
				4460	}
				4461	acc.low64 = XXH3_avalanche(acc.low64);
				4462	acc.high64 = XXH3_avalanche(acc.high64);
				4463	XXH_ASSERT(nbRounds >= 4);
				4464	for (i=4 ; i < nbRounds; i++) {
				4465	acc = XXH128_mix32B(acc,
				4466	input + (32 * i),
				4467	input + (32 * i) + 16,
				4468	secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
				4469	seed);
				4470	}
				4471	/* last bytes */
				4472	acc = XXH128_mix32B(acc,
				4473	input + len - 16,
				4474	input + len - 32,
				4475	secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
				4476	0ULL - seed);
				4477
				4478	{ XXH128_hash_t h128;
				4479	h128.low64 = acc.low64 + acc.high64;
				4480	h128.high64 = (acc.low64 * XXH_PRIME64_1)
				4481	+ (acc.high64 * XXH_PRIME64_4)
				4482	+ ((len - seed) * XXH_PRIME64_2);
				4483	h128.low64 = XXH3_avalanche(h128.low64);
				4484	h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
				4485	return h128;
				4486	}
				4487	}
				4488	}
				4489
				4490	XXH_FORCE_INLINE XXH128_hash_t
				4491	XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
				4492	const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
				4493	XXH3_f_accumulate_512 f_acc512,
				4494	XXH3_f_scrambleAcc f_scramble)
				4495	{
				4496	XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
				4497
				4498	XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
				4499
				4500	/* converge into final hash */
				4501	XXH_STATIC_ASSERT(sizeof(acc) == 64);
				4502	XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
				4503	{ XXH128_hash_t h128;
				4504	h128.low64 = XXH3_mergeAccs(acc,
				4505	secret + XXH_SECRET_MERGEACCS_START,
				4506	(xxh_u64)len * XXH_PRIME64_1);
				4507	h128.high64 = XXH3_mergeAccs(acc,
				4508	secret + secretSize
				4509	- sizeof(acc) - XXH_SECRET_MERGEACCS_START,
				4510	~((xxh_u64)len * XXH_PRIME64_2));
				4511	return h128;
				4512	}
				4513	}
				4514
				4515	/*
				4516	* It's important for performance that XXH3_hashLong is not inlined.
				4517	*/
				4518	XXH_NO_INLINE XXH128_hash_t
				4519	XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
				4520	XXH64_hash_t seed64,
				4521	const void* XXH_RESTRICT secret, size_t secretLen)
				4522	{
				4523	(void)seed64; (void)secret; (void)secretLen;
				4524	return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
				4525	XXH3_accumulate_512, XXH3_scrambleAcc);
				4526	}
				4527
				4528	/*
				4529	* It's important for performance that XXH3_hashLong is not inlined.
				4530	*/
				4531	XXH_NO_INLINE XXH128_hash_t
				4532	XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
				4533	XXH64_hash_t seed64,
				4534	const void* XXH_RESTRICT secret, size_t secretLen)
				4535	{
				4536	(void)seed64;
				4537	return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
				4538	XXH3_accumulate_512, XXH3_scrambleAcc);
				4539	}
				4540
				4541	XXH_FORCE_INLINE XXH128_hash_t
				4542	XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
				4543	XXH64_hash_t seed64,
				4544	XXH3_f_accumulate_512 f_acc512,
				4545	XXH3_f_scrambleAcc f_scramble,
				4546	XXH3_f_initCustomSecret f_initSec)
				4547	{
				4548	if (seed64 == 0)
				4549	return XXH3_hashLong_128b_internal(input, len,
				4550	XXH3_kSecret, sizeof(XXH3_kSecret),
				4551	f_acc512, f_scramble);
				4552	{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
				4553	f_initSec(secret, seed64);
				4554	return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
				4555	f_acc512, f_scramble);
				4556	}
				4557	}
				4558
				4559	/*
				4560	* It's important for performance that XXH3_hashLong is not inlined.
				4561	*/
				4562	XXH_NO_INLINE XXH128_hash_t
				4563	XXH3_hashLong_128b_withSeed(const void* input, size_t len,
				4564	XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
				4565	{
				4566	(void)secret; (void)secretLen;
				4567	return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
				4568	XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
				4569	}
				4570
				4571	typedef XXH128_hash_t (XXH3_hashLong128_f)(const void XXH_RESTRICT, size_t,
				4572	XXH64_hash_t, const void* XXH_RESTRICT, size_t);
				4573
				4574	XXH_FORCE_INLINE XXH128_hash_t
				4575	XXH3_128bits_internal(const void* input, size_t len,
				4576	XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
				4577	XXH3_hashLong128_f f_hl128)
				4578	{
				4579	XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
				4580	/*
				4581	* If an action is to be taken if `secret` conditions are not respected,
				4582	* it should be done here.
				4583	* For now, it's a contract pre-condition.
				4584	* Adding a check and a branch here would cost performance at every hash.
				4585	*/
				4586	if (len <= 16)
				4587	return XXH3_len_0to16_128b((const xxh_u8)input, len, (const xxh_u8)secret, seed64);
				4588	if (len <= 128)
				4589	return XXH3_len_17to128_128b((const xxh_u8)input, len, (const xxh_u8)secret, secretLen, seed64);
				4590	if (len <= XXH3_MIDSIZE_MAX)
				4591	return XXH3_len_129to240_128b((const xxh_u8)input, len, (const xxh_u8)secret, secretLen, seed64);
				4592	return f_hl128(input, len, seed64, secret, secretLen);
				4593	}
				4594
				4595
				4596	/* === Public XXH128 API === */
				4597
				4598	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
				4599	{
				4600	return XXH3_128bits_internal(input, len, 0,
				4601	XXH3_kSecret, sizeof(XXH3_kSecret),
				4602	XXH3_hashLong_128b_default);
				4603	}
				4604
				4605	XXH_PUBLIC_API XXH128_hash_t
				4606	XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
				4607	{
				4608	return XXH3_128bits_internal(input, len, 0,
				4609	(const xxh_u8*)secret, secretSize,
				4610	XXH3_hashLong_128b_withSecret);
				4611	}
				4612
				4613	XXH_PUBLIC_API XXH128_hash_t
				4614	XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
				4615	{
				4616	return XXH3_128bits_internal(input, len, seed,
				4617	XXH3_kSecret, sizeof(XXH3_kSecret),
				4618	XXH3_hashLong_128b_withSeed);
				4619	}
				4620
				4621	XXH_PUBLIC_API XXH128_hash_t
				4622	XXH128(const void* input, size_t len, XXH64_hash_t seed)
				4623	{
				4624	return XXH3_128bits_withSeed(input, len, seed);
				4625	}
				4626
				4627
				4628	/* === XXH3 128-bit streaming === */
				4629
				4630	/*
				4631	* All the functions are actually the same as for 64-bit streaming variant.
				4632	* The only difference is the finalization routine.
				4633	*/
				4634
				4635	static void
				4636	XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
				4637	XXH64_hash_t seed,
				4638	const void* secret, size_t secretSize)
				4639	{
				4640	XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
				4641	}
				4642
				4643	XXH_PUBLIC_API XXH_errorcode
				4644	XXH3_128bits_reset(XXH3_state_t* statePtr)
				4645	{
				4646	if (statePtr == NULL) return XXH_ERROR;
				4647	XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
				4648	return XXH_OK;
				4649	}
				4650
				4651	XXH_PUBLIC_API XXH_errorcode
				4652	XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
				4653	{
				4654	if (statePtr == NULL) return XXH_ERROR;
				4655	XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
				4656	if (secret == NULL) return XXH_ERROR;
				4657	if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
				4658	return XXH_OK;
				4659	}
				4660
				4661	XXH_PUBLIC_API XXH_errorcode
				4662	XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
				4663	{
				4664	if (statePtr == NULL) return XXH_ERROR;
				4665	if (seed==0) return XXH3_128bits_reset(statePtr);
				4666	if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
				4667	XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
				4668	return XXH_OK;
				4669	}
				4670
				4671	XXH_PUBLIC_API XXH_errorcode
				4672	XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
				4673	{
				4674	return XXH3_update(state, (const xxh_u8*)input, len,
				4675	XXH3_accumulate_512, XXH3_scrambleAcc);
				4676	}
				4677
				4678	XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
				4679	{
				4680	const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
				4681	if (state->totalLen > XXH3_MIDSIZE_MAX) {
				4682	XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
				4683	XXH3_digest_long(acc, state, secret);
				4684	XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
				4685	{ XXH128_hash_t h128;
				4686	h128.low64 = XXH3_mergeAccs(acc,
				4687	secret + XXH_SECRET_MERGEACCS_START,
				4688	(xxh_u64)state->totalLen * XXH_PRIME64_1);
				4689	h128.high64 = XXH3_mergeAccs(acc,
				4690	secret + state->secretLimit + XXH_STRIPE_LEN
				4691	- sizeof(acc) - XXH_SECRET_MERGEACCS_START,
				4692	~((xxh_u64)state->totalLen * XXH_PRIME64_2));
				4693	return h128;
				4694	}
				4695	}
				4696	/* len <= XXH3_MIDSIZE_MAX : short code */
				4697	if (state->seed)
				4698	return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
				4699	return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
				4700	secret, state->secretLimit + XXH_STRIPE_LEN);
				4701	}
				4702
				4703	/* 128-bit utility functions */
				4704
				4705	#include <string.h> /* memcmp, memcpy */
				4706
				4707	/* return : 1 is equal, 0 if different */
				4708	XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
				4709	{
				4710	/* note : XXH128_hash_t is compact, it has no padding byte */
				4711	return !(memcmp(&h1, &h2, sizeof(h1)));
				4712	}
				4713
				4714	/* This prototype is compatible with stdlib's qsort().
				4715	* return : >0 if h128_1 > h128_2
				4716	* <0 if h128_1 < h128_2
				4717	* =0 if h128_1 == h128_2 */
				4718	XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
				4719	{
				4720	XXH128_hash_t const h1 = (const XXH128_hash_t)h128_1;
				4721	XXH128_hash_t const h2 = (const XXH128_hash_t)h128_2;
				4722	int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
				4723	/* note : bets that, in most cases, hash values are different */
				4724	if (hcmp) return hcmp;
				4725	return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
				4726	}
				4727
				4728
				4729	/*====== Canonical representation ======*/
				4730	XXH_PUBLIC_API void
				4731	XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
				4732	{
				4733	XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
				4734	if (XXH_CPU_LITTLE_ENDIAN) {
				4735	hash.high64 = XXH_swap64(hash.high64);
				4736	hash.low64 = XXH_swap64(hash.low64);
				4737	}
				4738	memcpy(dst, &hash.high64, sizeof(hash.high64));
				4739	memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
				4740	}
				4741
				4742	XXH_PUBLIC_API XXH128_hash_t
				4743	XXH128_hashFromCanonical(const XXH128_canonical_t* src)
				4744	{
				4745	XXH128_hash_t h;
				4746	h.high64 = XXH_readBE64(src);
				4747	h.low64 = XXH_readBE64(src->digest + 8);
				4748	return h;
				4749	}
				4750
				4751	/* Pop our optimization override from above */
				4752	#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
				4753	&& defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
				4754	&& defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
				4755	# pragma GCC pop_options
				4756	#endif
				4757
				4758	#endif /* XXH_NO_LONG_LONG */
				4759
				4760
				4761	#endif /* XXH_IMPLEMENTATION */
Willy Tarreau	b5684e0	2015-04-27 11:59:40 +0200	[diff] [blame]	4762
				4763
				4764	#if defined (__cplusplus)
				4765	}
				4766	#endif