/*
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (C) 2012-2020 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/* TODO: update */
/* Notice extracted from xxHash homepage:

xxHash is an extremely fast hash algorithm, running at RAM speed limits.
It also successfully passes all tests from the SMHasher suite.

Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10

Q.Score is a measure of quality of the hash function.
It depends on successfully passing the SMHasher test set.
10 is a perfect score.

Note: SMHasher's CRC32 implementation is not the fastest one.
Other speed-oriented implementations can be faster,
especially in combination with the PCLMUL instruction:
https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735

A 64-bit version, named XXH64, is available since r35.
It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
*/

#if defined (__cplusplus)
extern "C" {
#endif

#include <haproxy/defaults.h>

/* ****************************
 * INLINE mode
 ******************************/
83/*!
84 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
85 * Use these build macros to inline xxhash into the target unit.
86 * Inlining improves performance on small inputs, especially when the length is
87 * expressed as a compile-time constant:
88 *
89 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
90 *
91 * It also keeps xxHash symbols private to the unit, so they are not exported.
92 *
93 * Usage:
94 * #define XXH_INLINE_ALL
95 * #include "xxhash.h"
96 *
97 * Do not compile and link xxhash.o as a separate object, as it is not useful.
98 */
99#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
100 && !defined(XXH_INLINE_ALL_31684351384)
101 /* this section should be traversed only once */
102# define XXH_INLINE_ALL_31684351384
103 /* give access to the advanced API, required to compile implementations */
104# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
105# define XXH_STATIC_LINKING_ONLY
106 /* make all functions private */
107# undef XXH_PUBLIC_API
108# if defined(__GNUC__)
109# define XXH_PUBLIC_API static __inline __attribute__((unused))
110# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
111# define XXH_PUBLIC_API static inline
112# elif defined(_MSC_VER)
113# define XXH_PUBLIC_API static __inline
114# else
115 /* note: this version may generate warnings for unused static functions */
116# define XXH_PUBLIC_API static
117# endif
Willy Tarreaub5684e02015-04-27 11:59:40 +0200118
Dragan Dosende374432020-12-22 12:00:37 +0100119 /*
120 * This part deals with the special case where a unit wants to inline xxHash,
121 * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
122 * as part of some previously included *.h header file.
123 * Without further action, the new include would just be ignored,
124 * and functions would effectively _not_ be inlined (silent failure).
125 * The following macros solve this situation by prefixing all inlined names,
126 * avoiding naming collision with previous inclusions.
127 */
128# ifdef XXH_NAMESPACE
129# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
130 /*
131 * Note: Alternative: #undef all symbols (it's a pretty large list).
132 * Without #error: it compiles, but functions are actually not inlined.
133 */
134# endif
135# define XXH_NAMESPACE XXH_INLINE_
136 /*
137 * Some identifiers (enums, type names) are not symbols, but they must
138 * still be renamed to avoid redeclaration.
139 * Alternative solution: do not redeclare them.
140 * However, this requires some #ifdefs, and is a more dispersed action.
141 * Meanwhile, renaming can be achieved in a single block
142 */
143# define XXH_IPREF(Id) XXH_INLINE_ ## Id
144# define XXH_OK XXH_IPREF(XXH_OK)
145# define XXH_ERROR XXH_IPREF(XXH_ERROR)
146# define XXH_errorcode XXH_IPREF(XXH_errorcode)
147# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
148# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
149# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
150# define XXH32_state_s XXH_IPREF(XXH32_state_s)
151# define XXH32_state_t XXH_IPREF(XXH32_state_t)
152# define XXH64_state_s XXH_IPREF(XXH64_state_s)
153# define XXH64_state_t XXH_IPREF(XXH64_state_t)
154# define XXH3_state_s XXH_IPREF(XXH3_state_s)
155# define XXH3_state_t XXH_IPREF(XXH3_state_t)
156# define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
157 /* Ensure the header is parsed again, even if it was previously included */
158# undef XXHASH_H_5627135585666179
159# undef XXHASH_H_STATIC_13879238742
160#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
Willy Tarreaub5684e02015-04-27 11:59:40 +0200161
162
Dragan Dosende374432020-12-22 12:00:37 +0100163
164/* ****************************************************************
165 * Stable API
166 *****************************************************************/
167#ifndef XXHASH_H_5627135585666179
168#define XXHASH_H_5627135585666179 1
169
170/* specific declaration modes for Windows */
171#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
172# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
173# ifdef XXH_EXPORT
174# define XXH_PUBLIC_API __declspec(dllexport)
175# elif XXH_IMPORT
176# define XXH_PUBLIC_API __declspec(dllimport)
177# endif
178# else
179# define XXH_PUBLIC_API /* do nothing */
180# endif
181#endif
182
/*!
 * XXH_NAMESPACE, aka Namespace Emulation:
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
 * any public symbol from the xxHash library with the value of XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: regular symbol names will be automatically translated
 * by this header.
 */
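/*
 * Illustrative sketch, not taken from the upstream documentation: a project
 * bundling its own copy of xxHash might define the namespace on the compiler
 * command line when building the xxHash translation unit, e.g.:
 *
 *     cc -DXXH_NAMESPACE=mylib_ -c xxhash.c
 *
 * The exported symbol then becomes mylib_XXH32 at link time, while calling
 * code built with the same definition keeps writing plain XXH32(), which this
 * header rewrites through macros:
 *
 *     #define XXH_NAMESPACE mylib_
 *     #include "xxhash.h"
 *
 *     XXH32_hash_t h = XXH32(buf, len, 0);   // resolves to mylib_XXH32
 *
 * The `mylib_` prefix, `buf` and `len` are placeholders, not part of the API.
 */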
196#ifdef XXH_NAMESPACE
197# define XXH_CAT(A,B) A##B
198# define XXH_NAME2(A,B) XXH_CAT(A,B)
199# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
200/* XXH32 */
201# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
202# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
203# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
204# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
205# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
206# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
207# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
208# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
209# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
210/* XXH64 */
211# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
212# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
213# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
214# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
215# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
216# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
217# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
218# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
219# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
220/* XXH3_64bits */
221# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
222# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
223# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
224# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
225# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
226# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
227# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
228# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
229# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
230# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
231# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
232# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
233/* XXH3_128bits */
234# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
235# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
236# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
237# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
238# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
239# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
240# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
241# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
242# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
243# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
244# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
245# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
246# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
247#endif
248
249
250/* *************************************
251* Version
252***************************************/
253#define XXH_VERSION_MAJOR 0
254#define XXH_VERSION_MINOR 8
255#define XXH_VERSION_RELEASE 0
256#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
257XXH_PUBLIC_API unsigned XXH_versionNumber (void);
258
259
260/* ****************************
261* Definitions
262******************************/
263#include <stddef.h> /* size_t */
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;


/*-**********************************************************************
268* 32-bit hash
269************************************************************************/
270#if !defined (__VMS) \
271 && (defined (__cplusplus) \
272 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
273# include <stdint.h>
274 typedef uint32_t XXH32_hash_t;
275#else
276# include <limits.h>
277# if UINT_MAX == 0xFFFFFFFFUL
278 typedef unsigned int XXH32_hash_t;
279# else
280# if ULONG_MAX == 0xFFFFFFFFUL
281 typedef unsigned long XXH32_hash_t;
282# else
283# error "unsupported platform: need a 32-bit type"
284# endif
285# endif
286#endif

/*!
 * XXH32():
 *  Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
 *  The memory between input & input+length must be valid (allocated and read-accessible).
 *  "seed" can be used to alter the result predictably.
 *  Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 * and offers true 64/128 bit hash results. It provides a superior level of
 * dispersion, and greatly reduces the risks of collisions.
 */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
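/*
 * Minimal one-shot usage sketch (illustrative only, not part of the official
 * documentation); the input buffer and the seed value 0 are arbitrary:
 *
 *     #include <stdio.h>
 *     #include "xxhash.h"
 *
 *     int main(void)
 *     {
 *         const char data[] = "hello world";
 *         XXH32_hash_t h = XXH32(data, sizeof(data) - 1, 0);   // seed = 0
 *         printf("%08x\n", (unsigned int)h);
 *         return 0;
 *     }
 */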

/******* Streaming *******/

/*
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * Each reset/update call returns an error code, with 0 meaning OK, and any
 * other value meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 */

typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
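/*
 * Streaming usage sketch (illustrative only): hashing two caller-provided
 * chunks as if they formed one contiguous input; error handling is reduced
 * to the bare minimum and the function name is a placeholder:
 *
 *     static XXH32_hash_t hash_two_chunks(const void* c1, size_t n1,
 *                                         const void* c2, size_t n2)
 *     {
 *         XXH32_hash_t h = 0;
 *         XXH32_state_t* const state = XXH32_createState();
 *         if (state == NULL) return 0;               // allocation failure
 *         if (XXH32_reset(state, 0) == XXH_OK) {     // seed = 0
 *             XXH32_update(state, c1, n1);           // feed input piece by piece
 *             XXH32_update(state, c2, n2);
 *             h = XXH32_digest(state);               // same result as XXH32() over c1 then c2
 *         }
 *         XXH32_freeState(state);
 *         return h;
 *     }
 */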
334
/******* Canonical representation *******/

/*
 * The default return values from XXH functions are unsigned 32 and 64 bit
 * integers.
 * This is the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of byte ordering, since little-endian
 * and big-endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 */

typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
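/*
 * Sketch (illustrative only) of writing a hash in canonical, big-endian form,
 * e.g. before storing it or sending it over a network; `buf` and `len` are
 * placeholders for caller-provided data:
 *
 *     XXH32_hash_t const h = XXH32(buf, len, 0);
 *     XXH32_canonical_t canon;
 *     XXH32_canonicalFromHash(&canon, h);
 *     // canon.digest now holds 4 bytes in a platform-independent order,
 *     // suitable for fwrite()/send(); the receiver recovers the value with:
 *     XXH32_hash_t const h2 = XXH32_hashFromCanonical(&canon);
 *     // h2 == h on any platform
 */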
359
360
361#ifndef XXH_NO_LONG_LONG
362/*-**********************************************************************
363* 64-bit hash
364************************************************************************/
365#if !defined (__VMS) \
366 && (defined (__cplusplus) \
367 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
368# include <stdint.h>
369 typedef uint64_t XXH64_hash_t;
370#else
371 /* the following type must have a width of 64-bit */
372 typedef unsigned long long XXH64_hash_t;
373#endif
374
375/*!
376 * XXH64():
377 * Returns the 64-bit hash of sequence of length @length stored at memory
378 * address @input.
379 * @seed can be used to alter the result predictably.
380 *
381 * This function usually runs faster on 64-bit systems, but slower on 32-bit
382 * systems (see benchmark).
383 *
384 * Note: XXH3 provides competitive speed for both 32-bit and 64-bit systems,
385 * and offers true 64/128 bit hash results. It provides a superior level of
386 * dispersion, and greatly reduces the risks of collisions.
387 */
388XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
389
390/******* Streaming *******/
391typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
392XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
393XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
394XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
395
396XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
397XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
398XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
399
400/******* Canonical representation *******/
401typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
402XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
403XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
404
405
406/*-**********************************************************************
407* XXH3 64-bit variant
408************************************************************************/
409
410/* ************************************************************************
411 * XXH3 is a new hash algorithm featuring:
412 * - Improved speed for both small and large inputs
413 * - True 64-bit and 128-bit outputs
414 * - SIMD acceleration
415 * - Improved 32-bit viability
416 *
417 * Speed analysis methodology is explained here:
418 *
419 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
420 *
421 * In general, expect XXH3 to run about ~2x faster on large inputs and >3x
422 * faster on small ones compared to XXH64, though exact differences depend on
423 * the platform.
424 *
425 * The algorithm is portable: Like XXH32 and XXH64, it generates the same hash
426 * on all platforms.
427 *
428 * It benefits greatly from SIMD and 64-bit arithmetic, but does not require it.
429 *
430 * Almost all 32-bit and 64-bit targets that can run XXH32 smoothly can run
431 * XXH3 at competitive speeds, even if XXH64 runs slowly. Further details are
432 * explained in the implementation.
433 *
434 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
435 * ZVector and scalar targets. This can be controlled with the XXH_VECTOR macro.
436 *
437 * XXH3 offers 2 variants, _64bits and _128bits.
438 * When only 64 bits are needed, prefer calling the _64bits variant, as it
439 * reduces the amount of mixing, resulting in faster speed on small inputs.
440 *
441 * It's also generally simpler to manipulate a scalar return type than a struct.
442 *
443 * The 128-bit version adds additional strength, but it is slightly slower.
444 *
445 * The XXH3 algorithm is still in development.
446 * The results it produces may still change in future versions.
447 *
448 * Results produced by v0.7.x are not comparable with results from v0.7.y.
449 * However, the API is completely stable, and it can safely be used for
450 * ephemeral data (local sessions).
451 *
452 * Avoid storing values in long-term storage until the algorithm is finalized.
 * XXH3's return values will be officially finalized upon reaching v0.8.0;
 * after that point, the return values of XXH3 and XXH128 will no longer
 * change in future versions.
457 *
458 * The API supports one-shot hashing, streaming mode, and custom secrets.
459 */
460
/* XXH3_64bits():
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
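/*
 * One-shot sketch (illustrative only): hashing a buffer with the default
 * secret and implicit seed 0; `data` and `size` are placeholders for
 * caller-provided input:
 *
 *     XXH64_hash_t const h = XXH3_64bits(data, size);
 */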
465
466/*
467 * XXH3_64bits_withSeed():
468 * This variant generates a custom secret on the fly
469 * based on default secret altered using the `seed` value.
470 * While this operation is decently fast, note that it's not completely free.
471 * Note: seed==0 produces the same results as XXH3_64bits().
472 */
473XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
474
475/*
476 * XXH3_64bits_withSecret():
477 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
478 * This makes it more difficult for an external actor to prepare an intentional collision.
479 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
480 * However, the quality of produced hash values depends on secret's entropy.
481 * Technically, the secret must look like a bunch of random bytes.
482 * Avoid "trivial" or structured data such as repeated sequences or a text document.
483 * Whenever unsure about the "randomness" of the blob of bytes,
484 * consider relabelling it as a "custom seed" instead,
485 * and employ "XXH3_generateSecret()" (see below)
486 * to generate a high entropy secret derived from the custom seed.
487 */
488#define XXH3_SECRET_SIZE_MIN 136
489XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
490
491
492/******* Streaming *******/
493/*
494 * Streaming requires state maintenance.
495 * This operation costs memory and CPU.
496 * As a consequence, streaming is slower than one-shot hashing.
497 * For better performance, prefer one-shot functions whenever applicable.
498 */
499typedef struct XXH3_state_s XXH3_state_t;
500XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
501XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
502XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
503
504/*
505 * XXH3_64bits_reset():
506 * Initialize with default parameters.
507 * digest will be equivalent to `XXH3_64bits()`.
508 */
509XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
510/*
511 * XXH3_64bits_reset_withSeed():
512 * Generate a custom secret from `seed`, and store it into `statePtr`.
513 * digest will be equivalent to `XXH3_64bits_withSeed()`.
514 */
515XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
516/*
517 * XXH3_64bits_reset_withSecret():
518 * `secret` is referenced, it _must outlive_ the hash streaming session.
519 * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
520 * and the quality of produced hash values depends on secret's entropy
521 * (secret's content should look like a bunch of random bytes).
522 * When in doubt about the randomness of a candidate `secret`,
523 * consider employing `XXH3_generateSecret()` instead (see below).
524 */
525XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
526
527XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
528XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
529
530/* note : canonical representation of XXH3 is the same as XXH64
531 * since they both produce XXH64_hash_t values */
532
533
534/*-**********************************************************************
535* XXH3 128-bit variant
536************************************************************************/
537
538typedef struct {
539 XXH64_hash_t low64;
540 XXH64_hash_t high64;
541} XXH128_hash_t;
542
543XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
544XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
545XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
546
547/******* Streaming *******/
548/*
549 * Streaming requires state maintenance.
550 * This operation costs memory and CPU.
551 * As a consequence, streaming is slower than one-shot hashing.
552 * For better performance, prefer one-shot functions whenever applicable.
553 *
554 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
555 * Use already declared XXH3_createState() and XXH3_freeState().
556 *
557 * All reset and streaming functions have same meaning as their 64-bit counterpart.
558 */
559
560XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
561XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
562XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
563
564XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
565XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
566
/* The following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
570
571/*!
572 * XXH128_isEqual():
573 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
574 */
575XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
576
577/*!
578 * XXH128_cmp():
579 *
580 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
581 *
582 * return: >0 if *h128_1 > *h128_2
583 * =0 if *h128_1 == *h128_2
584 * <0 if *h128_1 < *h128_2
585 */
586XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
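/*
 * Sketch (illustrative only): because XXH128_cmp() follows the
 * qsort()/bsearch() comparator convention, an array of XXH128_hash_t can be
 * sorted directly; `hashes` and `count` are placeholders for caller data:
 *
 *     #include <stdlib.h>   // qsort
 *
 *     qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
 *
 *     // equality test of two individual values:
 *     int same = XXH128_isEqual(hashes[0], hashes[1]);
 */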
587
588
589/******* Canonical representation *******/
590typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
591XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
592XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
593
594
595#endif /* XXH_NO_LONG_LONG */
596
597#endif /* XXHASH_H_5627135585666179 */
598
599
600
601#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
602#define XXHASH_H_STATIC_13879238742
603/* ****************************************************************************
604 * This section contains declarations which are not guaranteed to remain stable.
605 * They may change in future versions, becoming incompatible with a different
606 * version of the library.
607 * These declarations should only be used with static linking.
608 * Never use them in association with dynamic linking!
609 ***************************************************************************** */
610
611/*
612 * These definitions are only present to allow static allocation
613 * of XXH states, on stack or in a struct, for example.
614 * Never **ever** access their members directly.
615 */
616
617struct XXH32_state_s {
618 XXH32_hash_t total_len_32;
619 XXH32_hash_t large_len;
620 XXH32_hash_t v1;
621 XXH32_hash_t v2;
622 XXH32_hash_t v3;
623 XXH32_hash_t v4;
624 XXH32_hash_t mem32[4];
625 XXH32_hash_t memsize;
626 XXH32_hash_t reserved; /* never read nor write, might be removed in a future version */
627}; /* typedef'd to XXH32_state_t */
628
629
630#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
631
632struct XXH64_state_s {
633 XXH64_hash_t total_len;
634 XXH64_hash_t v1;
635 XXH64_hash_t v2;
636 XXH64_hash_t v3;
637 XXH64_hash_t v4;
638 XXH64_hash_t mem64[4];
639 XXH32_hash_t memsize;
640 XXH32_hash_t reserved32; /* required for padding anyway */
641 XXH64_hash_t reserved64; /* never read nor write, might be removed in a future version */
642}; /* typedef'd to XXH64_state_t */
643
644#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
645# include <stdalign.h>
646# define XXH_ALIGN(n) alignas(n)
647#elif defined(__GNUC__)
648# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
649#elif defined(_MSC_VER)
650# define XXH_ALIGN(n) __declspec(align(n))
651#else
652# define XXH_ALIGN(n) /* disabled */
653#endif
654
655/* Old GCC versions only accept the attribute after the type in structures. */
656#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
657 && defined(__GNUC__)
658# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
659#else
660# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
661#endif
662
663#define XXH3_INTERNALBUFFER_SIZE 256
664#define XXH3_SECRET_DEFAULT_SIZE 192
665struct XXH3_state_s {
666 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
667 /* used to store a custom secret generated from a seed */
668 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
669 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
670 XXH32_hash_t bufferedSize;
671 XXH32_hash_t reserved32;
672 size_t nbStripesSoFar;
673 XXH64_hash_t totalLen;
674 size_t nbStripesPerBlock;
675 size_t secretLimit;
676 XXH64_hash_t seed;
677 XXH64_hash_t reserved64;
678 const unsigned char* extSecret; /* reference to external secret;
679 * if == NULL, use .customSecret instead */
680 /* note: there may be some padding at the end due to alignment on 64 bytes */
681}; /* typedef'd to XXH3_state_t */
682
683#undef XXH_ALIGN_MEMBER
684
/* When the XXH3_state_t structure is merely allocated on the stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
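/*
 * Sketch (illustrative only) of a stack-allocated state used with a seeded
 * reset; `seed`, `input` and `len` are placeholders for caller-provided data:
 *
 *     XXH3_state_t state;
 *     XXH3_INITSTATE(&state);                      // required before a _withSeed reset
 *     XXH3_64bits_reset_withSeed(&state, seed);
 *     XXH3_64bits_update(&state, input, len);
 *     XXH64_hash_t const h = XXH3_64bits_digest(&state);
 */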
694
695
696/* === Experimental API === */
697/* Symbols defined below must be considered tied to a specific library version. */
698
699/*
700 * XXH3_generateSecret():
701 *
702 * Derive a high-entropy secret from any user-defined content, named customSeed.
703 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection than a 64-bit seed,
705 * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
706 *
707 * The function accepts as input a custom seed of any length and any content,
708 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
709 * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
711 *
712 * The generated secret can then be used with any `*_withSecret()` variant.
713 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
714 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
715 * are part of this list. They all accept a `secret` parameter
716 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
717 * _and_ feature very high entropy (consist of random-looking bytes).
718 * These conditions can be a high bar to meet, so
719 * this function can be used to generate a secret of proper quality.
720 *
721 * customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even a stupidly "low entropy" source such as a bunch of zeroes.
723 * The resulting `secret` will nonetheless provide all expected qualities.
724 *
725 * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
726 * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
727 */
728XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
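/*
 * Sketch (illustrative only): deriving a proper secret from an arbitrary,
 * possibly low-entropy seed blob, then hashing with it; `my_seed_blob`,
 * `my_seed_blob_size`, `data` and `size` are placeholders:
 *
 *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];   // 192 bytes, always filled entirely
 *     XXH3_generateSecret(secret, my_seed_blob, my_seed_blob_size);
 *
 *     XXH64_hash_t  const h64  = XXH3_64bits_withSecret (data, size, secret, sizeof(secret));
 *     XXH128_hash_t const h128 = XXH3_128bits_withSecret(data, size, secret, sizeof(secret));
 *
 * sizeof(secret) (192) satisfies the XXH3_SECRET_SIZE_MIN (136) requirement.
 */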
729
730
731/* simple short-cut to pre-selected XXH3_128bits variant */
732XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
733
734
735#endif /* XXH_NO_LONG_LONG */
736
737
738#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
739# define XXH_IMPLEMENTATION
740#endif
741
742#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
743
744
745/* ======================================================================== */
746/* ======================================================================== */
747/* ======================================================================== */
748
749
750/*-**********************************************************************
751 * xxHash implementation
752 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires the implementation to be visible to the compiler,
 * hence to be included alongside the header.
 * Previously, this was achieved by #including xxhash.c whenever inlining was
 * activated, but that construction created issues with a few build and install
 * systems, as it required xxhash.c to be stored in the /include directory.
761 *
762 * xxHash implementation is now directly integrated within xxhash.h.
763 * As a consequence, xxhash.c is no longer needed in /include.
764 *
765 * xxhash.c is still available and is still useful.
766 * In a "normal" setup, when xxhash is not inlined,
767 * xxhash.h only exposes the prototypes and public symbols,
768 * while xxhash.c can be built into an object file xxhash.o
769 * which can then be linked into the final binary.
770 ************************************************************************/
771
772#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
773 || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
774# define XXH_IMPLEM_13a8737387
775
776/* *************************************
777* Tuning parameters
778***************************************/
779/*!
780 * XXH_FORCE_MEMORY_ACCESS:
781 * By default, access to unaligned memory is controlled by `memcpy()`, which is
782 * safe and portable.
783 *
784 * Unfortunately, on some target/compiler combinations, the generated assembly
785 * is sub-optimal.
786 *
 * The switch below allows selection of a different access method
788 * in the search for improved performance.
789 * Method 0 (default):
790 * Use `memcpy()`. Safe and portable. Default.
791 * Method 1:
792 * `__attribute__((packed))` statement. It depends on compiler extensions
793 * and is therefore not portable.
 * This method is safe _if_ your compiler supports it,
 * and is *generally* as fast or faster than `memcpy`.
 * Method 2:
797 * Direct access via cast. This method doesn't depend on the compiler but
798 * violates the C standard.
799 * It can generate buggy code on targets which do not support unaligned
800 * memory accesses.
801 * But in some circumstances, it's the only known way to get the most
 * performance.
 * Method 3:
804 * Byteshift. This can generate the best code on old compilers which don't
805 * inline small `memcpy()` calls, and it might also be faster on big-endian
806 * systems which lack a native byteswap instruction.
807 * See https://stackoverflow.com/a/32095106/646947 for details.
808 * Prefer these methods in priority order (0 > 1 > 2 > 3)
809 */
810#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
Willy Tarreau4acb99f2021-02-04 17:02:39 +0100811 /* prefer __packed__ structures (method 1) for gcc on armv7 and armv8 */
812# if !defined(__clang__) && ( \
813 (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
814 (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)) )
Dragan Dosende374432020-12-22 12:00:37 +0100815# define XXH_FORCE_MEMORY_ACCESS 1
816# endif
817#endif
818
819/*!
820 * XXH_ACCEPT_NULL_INPUT_POINTER:
821 * If the input pointer is NULL, xxHash's default behavior is to dereference it,
822 * triggering a segfault.
823 * When this macro is enabled, xxHash actively checks the input for a null pointer.
824 * If it is, the result for null input pointers is the same as a zero-length input.
825 */
826#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
827# define XXH_ACCEPT_NULL_INPUT_POINTER 0
828#endif
829
830/*!
831 * XXH_FORCE_ALIGN_CHECK:
832 * This is an important performance trick
833 * for architectures without decent unaligned memory access performance.
834 * It checks for input alignment, and when conditions are met,
835 * uses a "fast path" employing direct 32-bit/64-bit read,
836 * resulting in _dramatically faster_ read speed.
837 *
838 * The check costs one initial branch per hash, which is generally negligible, but not zero.
839 * Moreover, it's not useful to generate binary for an additional code path
 * if memory access uses the same instruction for both aligned and unaligned addresses.
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * The alignment check is automatically disabled on x86, x64 & arm64,
 * which are platforms known to offer good unaligned memory access performance.
846 *
847 * This option does not affect XXH3 (only XXH32 and XXH64).
848 */
849#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
850# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
851 || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
852# define XXH_FORCE_ALIGN_CHECK 0
853# else
854# define XXH_FORCE_ALIGN_CHECK 1
855# endif
856#endif
857
858/*!
859 * XXH_NO_INLINE_HINTS:
860 *
861 * By default, xxHash tries to force the compiler to inline almost all internal
862 * functions.
863 *
864 * This can usually improve performance due to reduced jumping and improved
865 * constant folding, but significantly increases the size of the binary which
866 * might not be favorable.
867 *
868 * Additionally, sometimes the forced inlining can be detrimental to performance,
869 * depending on the architecture.
870 *
871 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
872 * compiler full control on whether to inline or not.
873 *
874 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
875 * -fno-inline with GCC or Clang, this will automatically be defined.
876 */
877#ifndef XXH_NO_INLINE_HINTS
878# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
879 || defined(__NO_INLINE__) /* -O0, -fno-inline */
880# define XXH_NO_INLINE_HINTS 1
881# else
882# define XXH_NO_INLINE_HINTS 0
883# endif
884#endif
885
886/*!
887 * XXH_REROLL:
 * Whether to reroll XXH32_finalize and XXH64_finalize,
889 * instead of using an unrolled jump table/if statement loop.
890 *
891 * This is automatically defined on -Os/-Oz on GCC and Clang.
892 */
893#ifndef XXH_REROLL
894# if defined(__OPTIMIZE_SIZE__)
895# define XXH_REROLL 1
896# else
897# define XXH_REROLL 0
898# endif
899#endif
900
901
902/* *************************************
903* Includes & Memory related functions
904***************************************/
905/*!
906 * Modify the local functions below should you wish to use
907 * different memory routines for malloc() and free()
908 */
909#include <stdlib.h>
910
911static void* XXH_malloc(size_t s) { return malloc(s); }
912static void XXH_free(void* p) { free(p); }
913
914/*! and for memcpy() */
915#include <string.h>
916static void* XXH_memcpy(void* dest, const void* src, size_t size)
917{
918 return memcpy(dest,src,size);
919}
920
921#include <limits.h> /* ULLONG_MAX */
922
923
924/* *************************************
925* Compiler Specific Options
926***************************************/
927#ifdef _MSC_VER /* Visual Studio warning fix */
928# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
929#endif
930
931#if XXH_NO_INLINE_HINTS /* disable inlining hints */
932# if defined(__GNUC__)
933# define XXH_FORCE_INLINE static __attribute__((unused))
934# else
935# define XXH_FORCE_INLINE static
936# endif
937# define XXH_NO_INLINE static
938/* enable inlining hints */
939#elif defined(_MSC_VER) /* Visual Studio */
940# define XXH_FORCE_INLINE static __forceinline
941# define XXH_NO_INLINE static __declspec(noinline)
942#elif defined(__GNUC__)
943# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
944# define XXH_NO_INLINE static __attribute__((noinline))
945#elif defined (__cplusplus) \
946 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
947# define XXH_FORCE_INLINE static inline
948# define XXH_NO_INLINE static
949#else
950# define XXH_FORCE_INLINE static
951# define XXH_NO_INLINE static
952#endif
953
954
955
956/* *************************************
957* Debug
958***************************************/
959/*
960 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
961 * compiler's command line options. The value must be a number.
962 */
963#ifndef XXH_DEBUGLEVEL
964# ifdef DEBUGLEVEL /* backwards compat */
965# define XXH_DEBUGLEVEL DEBUGLEVEL
966# else
967# define XXH_DEBUGLEVEL 0
968# endif
969#endif

#if (XXH_DEBUGLEVEL>=1)
972# include <assert.h> /* note: can still be disabled with NDEBUG */
973# define XXH_ASSERT(c) assert(c)
974#else
975# define XXH_ASSERT(c) ((void)0)
976#endif
977
978/* note: use after variable declarations */
979#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
980
981
982/* *************************************
983* Basic Types
984***************************************/
985#if !defined (__VMS) \
986 && (defined (__cplusplus) \
987 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
988# include <stdint.h>
989 typedef uint8_t xxh_u8;
990#else
991 typedef unsigned char xxh_u8;
992#endif
993typedef XXH32_hash_t xxh_u32;
994
995#ifdef XXH_OLD_NAMES
996# define BYTE xxh_u8
997# define U8 xxh_u8
998# define U32 xxh_u32
999#endif
1000
1001/* *** Memory access *** */
1002
1003#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1004/*
1005 * Manual byteshift. Best for old compilers which don't inline memcpy.
1006 * We actually directly use XXH_readLE32 and XXH_readBE32.
1007 */
1008#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
1009
1010/*
1011 * Force direct memory access. Only works on CPU which support unaligned memory
1012 * access in hardware.
1013 */
1014static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
1015
1016#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1017
1018/*
1019 * __pack instructions are safer but compiler specific, hence potentially
1020 * problematic for some compilers.
1021 *
1022 * Currently only defined for GCC and ICC.
1023 */
1024#ifdef XXH_OLD_NAMES
1025typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
1026#endif
1027static xxh_u32 XXH_read32(const void* ptr)
1028{
1029 typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
1030 return ((const xxh_unalign*)ptr)->u32;
1031}
1032
1033#else
1034
1035/*
1036 * Portable and safe solution. Generally efficient.
1037 * see: https://stackoverflow.com/a/32095106/646947
1038 */
1039static xxh_u32 XXH_read32(const void* memPtr)
1040{
1041 xxh_u32 val;
1042 memcpy(&val, memPtr, sizeof(val));
1043 return val;
1044}
1045
1046#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
1047
1048
/* *** Endianness *** */
typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
1051
1052/*!
1053 * XXH_CPU_LITTLE_ENDIAN:
1054 * Defined to 1 if the target is little endian, or 0 if it is big endian.
1055 * It can be defined externally, for example on the compiler command line.
1056 *
1057 * If it is not defined, a runtime check (which is usually constant folded)
1058 * is used instead.
1059 */
1060#ifndef XXH_CPU_LITTLE_ENDIAN
1061/*
1062 * Try to detect endianness automatically, to avoid the nonstandard behavior
1063 * in `XXH_isLittleEndian()`
1064 */
1065# if defined(_WIN32) /* Windows is always little endian */ \
1066 || defined(__LITTLE_ENDIAN__) \
1067 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1068# define XXH_CPU_LITTLE_ENDIAN 1
1069# elif defined(__BIG_ENDIAN__) \
1070 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
1071# define XXH_CPU_LITTLE_ENDIAN 0
1072# else
1073/*
1074 * runtime test, presumed to simplify to a constant by compiler
1075 */
1076static int XXH_isLittleEndian(void)
1077{
1078 /*
1079 * Portable and well-defined behavior.
1080 * Don't use static: it is detrimental to performance.
1081 */
1082 const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
1083 return one.c[0];
1084}
1085# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
1086# endif
1087#endif
1088
1089
1090
1091
1092/* ****************************************
1093* Compiler-specific Functions and Macros
1094******************************************/
1095#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
1096
1097#ifdef __has_builtin
1098# define XXH_HAS_BUILTIN(x) __has_builtin(x)
1099#else
1100# define XXH_HAS_BUILTIN(x) 0
1101#endif
1102
1103#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
1104 && XXH_HAS_BUILTIN(__builtin_rotateleft64)
1105# define XXH_rotl32 __builtin_rotateleft32
1106# define XXH_rotl64 __builtin_rotateleft64
/* Note: although _rotl exists for MinGW (GCC under Windows), performance seems poor */
1108#elif defined(_MSC_VER)
1109# define XXH_rotl32(x,r) _rotl(x,r)
1110# define XXH_rotl64(x,r) _rotl64(x,r)
1111#else
1112# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
1113# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
1114#endif
1115
1116#if defined(_MSC_VER) /* Visual Studio */
1117# define XXH_swap32 _byteswap_ulong
1118#elif XXH_GCC_VERSION >= 403
1119# define XXH_swap32 __builtin_bswap32
1120#else
1121static xxh_u32 XXH_swap32 (xxh_u32 x)
1122{
1123 return ((x << 24) & 0xff000000 ) |
1124 ((x << 8) & 0x00ff0000 ) |
1125 ((x >> 8) & 0x0000ff00 ) |
1126 ((x >> 24) & 0x000000ff );
1127}
1128#endif


/* ***************************
* Memory reads
*****************************/
typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;

/*
 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
 *
 * This is ideal for older compilers which don't inline memcpy.
 */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))

XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
1144{
1145 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1146 return bytePtr[0]
1147 | ((xxh_u32)bytePtr[1] << 8)
1148 | ((xxh_u32)bytePtr[2] << 16)
1149 | ((xxh_u32)bytePtr[3] << 24);
1150}
1151
1152XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
1153{
1154 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1155 return bytePtr[3]
1156 | ((xxh_u32)bytePtr[2] << 8)
1157 | ((xxh_u32)bytePtr[1] << 16)
1158 | ((xxh_u32)bytePtr[0] << 24);
1159}
1160
1161#else
1162XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
1163{
1164 return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
1165}
1166
1167static xxh_u32 XXH_readBE32(const void* ptr)
1168{
1169 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
1170}
1171#endif
1172
1173XXH_FORCE_INLINE xxh_u32
1174XXH_readLE32_align(const void* ptr, XXH_alignment align)
1175{
1176 if (align==XXH_unaligned) {
1177 return XXH_readLE32(ptr);
1178 } else {
1179 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
1180 }
1181}
1182
1183
1184/* *************************************
1185* Misc
1186***************************************/
1187XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
1188
1189
1190/* *******************************************************************
1191* 32-bit hash functions
1192*********************************************************************/
1193static const xxh_u32 XXH_PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */
1194static const xxh_u32 XXH_PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */
1195static const xxh_u32 XXH_PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */
1196static const xxh_u32 XXH_PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */
1197static const xxh_u32 XXH_PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */
1198
1199#ifdef XXH_OLD_NAMES
1200# define PRIME32_1 XXH_PRIME32_1
1201# define PRIME32_2 XXH_PRIME32_2
1202# define PRIME32_3 XXH_PRIME32_3
1203# define PRIME32_4 XXH_PRIME32_4
1204# define PRIME32_5 XXH_PRIME32_5
1205#endif
1206
1207static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
1208{
1209 acc += input * XXH_PRIME32_2;
1210 acc = XXH_rotl32(acc, 13);
1211 acc *= XXH_PRIME32_1;
1212#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
1213 /*
1214 * UGLY HACK:
1215 * This inline assembly hack forces acc into a normal register. This is the
1216 * only thing that prevents GCC and Clang from autovectorizing the XXH32
 * loop (pragmas and attributes don't work for some reason) without globally
1218 * disabling SSE4.1.
1219 *
1220 * The reason we want to avoid vectorization is because despite working on
1221 * 4 integers at a time, there are multiple factors slowing XXH32 down on
1222 * SSE4:
1223 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
1224 * newer chips!) making it slightly slower to multiply four integers at
1225 * once compared to four integers independently. Even when pmulld was
1226 * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
1227 * just to multiply unless doing a long operation.
1228 *
1229 * - Four instructions are required to rotate,
 * movdqa tmp, v // not required with VEX encoding
1231 * pslld tmp, 13 // tmp <<= 13
1232 * psrld v, 19 // x >>= 19
1233 * por v, tmp // x |= tmp
1234 * compared to one for scalar:
1235 * roll v, 13 // reliably fast across the board
1236 * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
1237 *
1238 * - Instruction level parallelism is actually more beneficial here because
1239 * the SIMD actually serializes this operation: While v1 is rotating, v2
1240 * can load data, while v3 can multiply. SSE forces them to operate
1241 * together.
1242 *
1243 * How this hack works:
1244 * __asm__("" // Declare an assembly block but don't declare any instructions
1245 * : // However, as an Input/Output Operand,
1246 * "+r" // constrain a read/write operand (+) as a general purpose register (r).
1247 * (acc) // and set acc as the operand
1248 * );
1249 *
 * Because of the 'r', the compiler has promised that acc will be in a
1251 * general purpose register and the '+' says that it will be 'read/write',
1252 * so it has to assume it has changed. It is like volatile without all the
1253 * loads and stores.
1254 *
1255 * Since the argument has to be in a normal register (not an SSE register),
1256 * each time XXH32_round is called, it is impossible to vectorize.
1257 */
1258 __asm__("" : "+r" (acc));
1259#endif
1260 return acc;
1261}
1262
1263/* mix all bits */
1264static xxh_u32 XXH32_avalanche(xxh_u32 h32)
1265{
1266 h32 ^= h32 >> 15;
1267 h32 *= XXH_PRIME32_2;
1268 h32 ^= h32 >> 13;
1269 h32 *= XXH_PRIME32_3;
1270 h32 ^= h32 >> 16;
1271 return(h32);
1272}
1273
1274#define XXH_get32bits(p) XXH_readLE32_align(p, align)
1275
1276static xxh_u32
1277XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1278{
1279#define XXH_PROCESS1 do { \
1280 h32 += (*ptr++) * XXH_PRIME32_5; \
1281 h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
1282} while (0)
1283
1284#define XXH_PROCESS4 do { \
1285 h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
1286 ptr += 4; \
1287 h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
1288} while (0)
1289
1290 /* Compact rerolled version */
1291 if (XXH_REROLL) {
1292 len &= 15;
1293 while (len >= 4) {
1294 XXH_PROCESS4;
1295 len -= 4;
1296 }
1297 while (len > 0) {
1298 XXH_PROCESS1;
1299 --len;
1300 }
1301 return XXH32_avalanche(h32);
1302 } else {
1303 switch(len&15) /* or switch(bEnd - p) */ {
1304 case 12: XXH_PROCESS4;
1305 /* fallthrough */
1306 case 8: XXH_PROCESS4;
1307 /* fallthrough */
1308 case 4: XXH_PROCESS4;
1309 return XXH32_avalanche(h32);
1310
1311 case 13: XXH_PROCESS4;
1312 /* fallthrough */
1313 case 9: XXH_PROCESS4;
1314 /* fallthrough */
1315 case 5: XXH_PROCESS4;
1316 XXH_PROCESS1;
1317 return XXH32_avalanche(h32);
1318
1319 case 14: XXH_PROCESS4;
1320 /* fallthrough */
1321 case 10: XXH_PROCESS4;
1322 /* fallthrough */
1323 case 6: XXH_PROCESS4;
1324 XXH_PROCESS1;
1325 XXH_PROCESS1;
1326 return XXH32_avalanche(h32);
1327
1328 case 15: XXH_PROCESS4;
1329 /* fallthrough */
1330 case 11: XXH_PROCESS4;
1331 /* fallthrough */
1332 case 7: XXH_PROCESS4;
1333 /* fallthrough */
1334 case 3: XXH_PROCESS1;
1335 /* fallthrough */
1336 case 2: XXH_PROCESS1;
1337 /* fallthrough */
1338 case 1: XXH_PROCESS1;
1339 /* fallthrough */
1340 case 0: return XXH32_avalanche(h32);
1341 }
1342 XXH_ASSERT(0);
1343 return h32; /* reaching this point is deemed impossible */
1344 }
1345}
1346
1347#ifdef XXH_OLD_NAMES
1348# define PROCESS1 XXH_PROCESS1
1349# define PROCESS4 XXH_PROCESS4
1350#else
1351# undef XXH_PROCESS1
1352# undef XXH_PROCESS4
1353#endif
1354
1355XXH_FORCE_INLINE xxh_u32
1356XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
1357{
1358 const xxh_u8* bEnd = input + len;
1359 xxh_u32 h32;
1360
1361#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1362 if (input==NULL) {
1363 len=0;
1364 bEnd=input=(const xxh_u8*)(size_t)16;
1365 }
1366#endif
1367
1368 if (len>=16) {
1369 const xxh_u8* const limit = bEnd - 15;
1370 xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1371 xxh_u32 v2 = seed + XXH_PRIME32_2;
1372 xxh_u32 v3 = seed + 0;
1373 xxh_u32 v4 = seed - XXH_PRIME32_1;
1374
1375        do {
1376 v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
1377 v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
1378 v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
1379 v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
1380 } while (input < limit);
1381
1382        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
1383 + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
1384 } else {
1385 h32 = seed + XXH_PRIME32_5;
1386 }
1387
1388 h32 += (xxh_u32)len;
1389
1390 return XXH32_finalize(h32, input, len&15, align);
1391}
1392
1393
1394XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
1395{
1396#if 0
1397 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1398 XXH32_state_t state;
1399 XXH32_reset(&state, seed);
1400 XXH32_update(&state, (const xxh_u8*)input, len);
1401 return XXH32_digest(&state);
1402
1403#else
1404
1405 if (XXH_FORCE_ALIGN_CHECK) {
1406 if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
1407 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
1408 } }
1409
1410 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
1411#endif
1412}
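/*
 * Illustrative sketch (not part of the library): typical one-shot usage of
 * XXH32(). The buffer and seed below are arbitrary examples.
 *
 *   const char data[] = "example input";
 *   XXH32_hash_t const h = XXH32(data, sizeof(data) - 1, 0);   // seed = 0
 *   // The same input and seed always produce the same 32-bit hash.
 */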
1413
1414
1415
1416/******* Hash streaming *******/
1417
1418XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
1419{
1420 return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
1421}
1422XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
1423{
1424 XXH_free(statePtr);
1425 return XXH_OK;
1426}
1427
1428XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
1429{
1430 memcpy(dstState, srcState, sizeof(*dstState));
1431}
1432
1433XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
1434{
1435 XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
1436 memset(&state, 0, sizeof(state));
1437 state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1438 state.v2 = seed + XXH_PRIME32_2;
1439 state.v3 = seed + 0;
1440 state.v4 = seed - XXH_PRIME32_1;
1441 /* do not write into reserved, planned to be removed in a future version */
1442 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
1443 return XXH_OK;
1444}
1445
1446
1447XXH_PUBLIC_API XXH_errorcode
1448XXH32_update(XXH32_state_t* state, const void* input, size_t len)
1449{
1450 if (input==NULL)
1451#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1452 return XXH_OK;
1453#else
1454 return XXH_ERROR;
1455#endif
1456
1457 { const xxh_u8* p = (const xxh_u8*)input;
1458 const xxh_u8* const bEnd = p + len;
1459
1460 state->total_len_32 += (XXH32_hash_t)len;
1461 state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
1462
1463 if (state->memsize + len < 16) { /* fill in tmp buffer */
1464 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
1465 state->memsize += (XXH32_hash_t)len;
1466 return XXH_OK;
1467 }
1468
1469 if (state->memsize) { /* some data left from previous update */
1470 XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
1471 { const xxh_u32* p32 = state->mem32;
1472 state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
1473 state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
1474 state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
1475 state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
1476 }
1477 p += 16-state->memsize;
1478 state->memsize = 0;
1479 }
1480
1481 if (p <= bEnd-16) {
1482 const xxh_u8* const limit = bEnd - 16;
1483 xxh_u32 v1 = state->v1;
1484 xxh_u32 v2 = state->v2;
1485 xxh_u32 v3 = state->v3;
1486 xxh_u32 v4 = state->v4;
1487
1488 do {
1489 v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
1490 v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
1491 v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
1492 v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
1493 } while (p<=limit);
1494
1495 state->v1 = v1;
1496 state->v2 = v2;
1497 state->v3 = v3;
1498 state->v4 = v4;
1499 }
1500
1501 if (p < bEnd) {
1502 XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
1503 state->memsize = (unsigned)(bEnd-p);
1504 }
1505 }
1506
1507 return XXH_OK;
1508}
1509
1510
1511XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)
1512{
1513 xxh_u32 h32;
1514
1515 if (state->large_len) {
1516 h32 = XXH_rotl32(state->v1, 1)
1517 + XXH_rotl32(state->v2, 7)
1518 + XXH_rotl32(state->v3, 12)
1519 + XXH_rotl32(state->v4, 18);
1520 } else {
1521 h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
1522 }
1523
1524 h32 += state->total_len_32;
1525
1526 return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
1527}
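/*
 * Illustrative sketch (not part of the library): typical use of the streaming
 * API above. `next_chunk()` is a hypothetical helper that yields input pieces
 * of arbitrary size; error handling is omitted for brevity.
 *
 *   XXH32_state_t* const state = XXH32_createState();
 *   XXH32_reset(state, 0);                       // same seed as one-shot XXH32()
 *   while (next_chunk(&buf, &bufSize))
 *       XXH32_update(state, buf, bufSize);
 *   hash = XXH32_digest(state);                  // equals XXH32(whole_input, total_len, 0)
 *   XXH32_freeState(state);
 */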
1528
1529
1530/******* Canonical representation *******/
1531
1532/*
1533 * The default return values from XXH functions are unsigned 32 and 64 bit
1534 * integers.
1535 *
1536 * The canonical representation uses big endian convention, the same convention
1537 * as human-readable numbers (large digits first).
1538 *
1539 * This way, hash values can be written into a file or buffer, remaining
1540 * comparable across different systems.
1541 *
1542 * The following functions allow transformation of hash values to and from their
1543 * canonical format.
1544 */
1545XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
1546{
1547 XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
1548 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
1549 memcpy(dst, &hash, sizeof(*dst));
1550}
1551
1552XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
1553{
1554 return XXH_readBE32(src);
1555}
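/*
 * Illustrative sketch (not part of the library): writing a hash in canonical
 * (big endian) form so it stays comparable across platforms, then reading it
 * back. `buf`/`bufSize` are placeholders for the hashed input.
 *
 *   XXH32_canonical_t c;
 *   XXH32_canonicalFromHash(&c, XXH32(buf, bufSize, 0));
 *   // ... store or transmit the sizeof(c) bytes of c ...
 *   XXH32_hash_t const h = XXH32_hashFromCanonical(&c);   // same value on any endianness
 */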
1556
1557
1558#ifndef XXH_NO_LONG_LONG
1559
1560/* *******************************************************************
1561* 64-bit hash functions
1562*********************************************************************/
1563
1564/******* Memory access *******/
1565
1566typedef XXH64_hash_t xxh_u64;
1567
1568#ifdef XXH_OLD_NAMES
1569# define U64 xxh_u64
1570#endif
1571
1572/*!
1573 * XXH_REROLL_XXH64:
1574 * Whether to reroll the XXH64_finalize() loop.
1575 *
1576 * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a
1577 * performance gain on 64-bit hosts, as only one jump is required.
1578 *
1579 * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit
1580 * registers, and 64-bit arithmetic needs to be simulated, it isn't beneficial
1581 * to unroll. The code becomes ridiculously large (the largest function in the
1582 * binary on i386!), and rerolling it saves anywhere from 3kB to 20kB. It is
1583 * also slightly faster because it fits into cache better and is more likely
1584 * to be inlined by the compiler.
1585 *
1586 * If XXH_REROLL is defined, this is ignored and the loop is always rerolled.
1587 */
1588#ifndef XXH_REROLL_XXH64
1589# if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
1590 || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
1591 || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
1592 || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
1593 || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
1594 || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
1595# define XXH_REROLL_XXH64 1
1596# else
1597# define XXH_REROLL_XXH64 0
1598# endif
1599#endif /* !defined(XXH_REROLL_XXH64) */
1600
1601#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1602/*
1603 * Manual byteshift. Best for old compilers which don't inline memcpy.
1604 * We actually directly use XXH_readLE64 and XXH_readBE64.
1605 */
1606#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
1607
1608/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
1609static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
1610
1611#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1612
1613/*
1614 * __pack instructions are safer, but compiler specific, hence potentially
1615 * problematic for some compilers.
1616 *
1617 * Currently only defined for GCC and ICC.
1618 */
1619#ifdef XXH_OLD_NAMES
1620typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
1621#endif
1622static xxh_u64 XXH_read64(const void* ptr)
1623{
1624 typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
1625 return ((const xxh_unalign64*)ptr)->u64;
1626}
1627
1628#else
1629
1630/*
1631 * Portable and safe solution. Generally efficient.
1632 * see: https://stackoverflow.com/a/32095106/646947
1633 */
1634static xxh_u64 XXH_read64(const void* memPtr)
1635{
1636 xxh_u64 val;
1637 memcpy(&val, memPtr, sizeof(val));
1638 return val;
1639}
1640
1641#endif /* XXH_FORCE_MEMORY_ACCESS */
1642
1643#if defined(_MSC_VER)     /* Visual Studio */
1644# define XXH_swap64 _byteswap_uint64
1645#elif XXH_GCC_VERSION >= 403
1646# define XXH_swap64 __builtin_bswap64
1647#else
1648static xxh_u64 XXH_swap64 (xxh_u64 x)
1649{
1650 return ((x << 56) & 0xff00000000000000ULL) |
1651 ((x << 40) & 0x00ff000000000000ULL) |
1652 ((x << 24) & 0x0000ff0000000000ULL) |
1653 ((x << 8) & 0x000000ff00000000ULL) |
1654 ((x >> 8) & 0x00000000ff000000ULL) |
1655 ((x >> 24) & 0x0000000000ff0000ULL) |
1656 ((x >> 40) & 0x000000000000ff00ULL) |
1657 ((x >> 56) & 0x00000000000000ffULL);
1658}
1659#endif
1660
1661
1662/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
1663#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
1664
1665XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
1666{
1667 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1668 return bytePtr[0]
1669 | ((xxh_u64)bytePtr[1] << 8)
1670 | ((xxh_u64)bytePtr[2] << 16)
1671 | ((xxh_u64)bytePtr[3] << 24)
1672 | ((xxh_u64)bytePtr[4] << 32)
1673 | ((xxh_u64)bytePtr[5] << 40)
1674 | ((xxh_u64)bytePtr[6] << 48)
1675 | ((xxh_u64)bytePtr[7] << 56);
1676}
1677
1678XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
1679{
1680 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
1681 return bytePtr[7]
1682 | ((xxh_u64)bytePtr[6] << 8)
1683 | ((xxh_u64)bytePtr[5] << 16)
1684 | ((xxh_u64)bytePtr[4] << 24)
1685 | ((xxh_u64)bytePtr[3] << 32)
1686 | ((xxh_u64)bytePtr[2] << 40)
1687 | ((xxh_u64)bytePtr[1] << 48)
1688 | ((xxh_u64)bytePtr[0] << 56);
1689}
1690
1691#else
1692XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
1693{
1694 return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
1695}
1696
1697static xxh_u64 XXH_readBE64(const void* ptr)
1698{
1699 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
1700}
1701#endif
1702
1703XXH_FORCE_INLINE xxh_u64
1704XXH_readLE64_align(const void* ptr, XXH_alignment align)
1705{
1706 if (align==XXH_unaligned)
1707 return XXH_readLE64(ptr);
1708 else
1709 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
1710}
1711
1712
1713/******* xxh64 *******/
1714
1715static const xxh_u64 XXH_PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
1716static const xxh_u64 XXH_PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
1717static const xxh_u64 XXH_PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
1718static const xxh_u64 XXH_PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
1719static const xxh_u64 XXH_PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
1720
1721#ifdef XXH_OLD_NAMES
1722# define PRIME64_1 XXH_PRIME64_1
1723# define PRIME64_2 XXH_PRIME64_2
1724# define PRIME64_3 XXH_PRIME64_3
1725# define PRIME64_4 XXH_PRIME64_4
1726# define PRIME64_5 XXH_PRIME64_5
1727#endif
1728
1729static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
1730{
1731 acc += input * XXH_PRIME64_2;
1732 acc = XXH_rotl64(acc, 31);
1733 acc *= XXH_PRIME64_1;
1734 return acc;
1735}
1736
1737static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
1738{
1739 val = XXH64_round(0, val);
1740 acc ^= val;
1741 acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
1742 return acc;
1743}
1744
1745static xxh_u64 XXH64_avalanche(xxh_u64 h64)
1746{
1747 h64 ^= h64 >> 33;
1748 h64 *= XXH_PRIME64_2;
1749 h64 ^= h64 >> 29;
1750 h64 *= XXH_PRIME64_3;
1751 h64 ^= h64 >> 32;
1752 return h64;
1753}
1754
1755
1756#define XXH_get64bits(p) XXH_readLE64_align(p, align)
1757
1758static xxh_u64
1759XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
1760{
1761#define XXH_PROCESS1_64 do { \
1762 h64 ^= (*ptr++) * XXH_PRIME64_5; \
1763 h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; \
1764} while (0)
1765
1766#define XXH_PROCESS4_64 do { \
1767 h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; \
1768 ptr += 4; \
1769 h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; \
1770} while (0)
1771
1772#define XXH_PROCESS8_64 do { \
1773 xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
1774 ptr += 8; \
1775 h64 ^= k1; \
1776 h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; \
1777} while (0)
1778
1779 /* Rerolled version for 32-bit targets is faster and much smaller. */
1780 if (XXH_REROLL || XXH_REROLL_XXH64) {
1781 len &= 31;
1782 while (len >= 8) {
1783 XXH_PROCESS8_64;
1784 len -= 8;
1785 }
1786 if (len >= 4) {
1787 XXH_PROCESS4_64;
1788 len -= 4;
1789 }
1790 while (len > 0) {
1791 XXH_PROCESS1_64;
1792 --len;
1793 }
1794 return XXH64_avalanche(h64);
1795 } else {
1796 switch(len & 31) {
1797 case 24: XXH_PROCESS8_64;
1798 /* fallthrough */
1799 case 16: XXH_PROCESS8_64;
1800 /* fallthrough */
1801 case 8: XXH_PROCESS8_64;
1802 return XXH64_avalanche(h64);
1803
1804 case 28: XXH_PROCESS8_64;
1805 /* fallthrough */
1806 case 20: XXH_PROCESS8_64;
1807 /* fallthrough */
1808 case 12: XXH_PROCESS8_64;
1809 /* fallthrough */
1810 case 4: XXH_PROCESS4_64;
1811 return XXH64_avalanche(h64);
1812
1813 case 25: XXH_PROCESS8_64;
1814 /* fallthrough */
1815 case 17: XXH_PROCESS8_64;
1816 /* fallthrough */
1817 case 9: XXH_PROCESS8_64;
1818 XXH_PROCESS1_64;
1819 return XXH64_avalanche(h64);
1820
1821 case 29: XXH_PROCESS8_64;
1822 /* fallthrough */
1823 case 21: XXH_PROCESS8_64;
1824 /* fallthrough */
1825 case 13: XXH_PROCESS8_64;
1826 /* fallthrough */
1827 case 5: XXH_PROCESS4_64;
1828 XXH_PROCESS1_64;
1829 return XXH64_avalanche(h64);
1830
1831 case 26: XXH_PROCESS8_64;
1832 /* fallthrough */
1833 case 18: XXH_PROCESS8_64;
1834 /* fallthrough */
1835 case 10: XXH_PROCESS8_64;
1836 XXH_PROCESS1_64;
1837 XXH_PROCESS1_64;
1838 return XXH64_avalanche(h64);
1839
1840 case 30: XXH_PROCESS8_64;
1841 /* fallthrough */
1842 case 22: XXH_PROCESS8_64;
1843 /* fallthrough */
1844 case 14: XXH_PROCESS8_64;
1845 /* fallthrough */
1846 case 6: XXH_PROCESS4_64;
1847 XXH_PROCESS1_64;
1848 XXH_PROCESS1_64;
1849 return XXH64_avalanche(h64);
1850
1851 case 27: XXH_PROCESS8_64;
1852 /* fallthrough */
1853 case 19: XXH_PROCESS8_64;
1854 /* fallthrough */
1855 case 11: XXH_PROCESS8_64;
1856 XXH_PROCESS1_64;
1857 XXH_PROCESS1_64;
1858 XXH_PROCESS1_64;
1859 return XXH64_avalanche(h64);
1860
1861 case 31: XXH_PROCESS8_64;
1862 /* fallthrough */
1863 case 23: XXH_PROCESS8_64;
1864 /* fallthrough */
1865 case 15: XXH_PROCESS8_64;
1866 /* fallthrough */
1867 case 7: XXH_PROCESS4_64;
1868 /* fallthrough */
1869 case 3: XXH_PROCESS1_64;
1870 /* fallthrough */
1871 case 2: XXH_PROCESS1_64;
1872 /* fallthrough */
1873 case 1: XXH_PROCESS1_64;
1874 /* fallthrough */
1875 case 0: return XXH64_avalanche(h64);
1876 }
1877 }
1878 /* impossible to reach */
1879 XXH_ASSERT(0);
1880 return 0; /* unreachable, but some compilers complain without it */
1881}
1882
1883#ifdef XXH_OLD_NAMES
1884# define PROCESS1_64 XXH_PROCESS1_64
1885# define PROCESS4_64 XXH_PROCESS4_64
1886# define PROCESS8_64 XXH_PROCESS8_64
1887#else
1888# undef XXH_PROCESS1_64
1889# undef XXH_PROCESS4_64
1890# undef XXH_PROCESS8_64
1891#endif
1892
1893XXH_FORCE_INLINE xxh_u64
1894XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
1895{
1896 const xxh_u8* bEnd = input + len;
1897 xxh_u64 h64;
1898
1899#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1900 if (input==NULL) {
1901 len=0;
1902 bEnd=input=(const xxh_u8*)(size_t)32;
1903 }
1904#endif
1905
1906 if (len>=32) {
1907 const xxh_u8* const limit = bEnd - 32;
1908 xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
1909 xxh_u64 v2 = seed + XXH_PRIME64_2;
1910 xxh_u64 v3 = seed + 0;
1911 xxh_u64 v4 = seed - XXH_PRIME64_1;
1912
1913 do {
1914 v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
1915 v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
1916 v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
1917 v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
1918 } while (input<=limit);
1919
1920 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
1921 h64 = XXH64_mergeRound(h64, v1);
1922 h64 = XXH64_mergeRound(h64, v2);
1923 h64 = XXH64_mergeRound(h64, v3);
1924 h64 = XXH64_mergeRound(h64, v4);
1925
1926 } else {
1927 h64 = seed + XXH_PRIME64_5;
1928 }
1929
1930 h64 += (xxh_u64) len;
1931
1932 return XXH64_finalize(h64, input, len, align);
1933}
1934
1935
1936XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
1937{
1938#if 0
1939 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1940 XXH64_state_t state;
1941 XXH64_reset(&state, seed);
1942 XXH64_update(&state, (const xxh_u8*)input, len);
1943 return XXH64_digest(&state);
1944
1945#else
1946
1947 if (XXH_FORCE_ALIGN_CHECK) {
1948 if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
1949 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
1950 } }
1951
1952 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
1953
1954#endif
1955}
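/*
 * Illustrative sketch (not part of the library): seeded one-shot usage of
 * XXH64(). The message and seed values below are arbitrary examples.
 *
 *   const char msg[] = "example input";
 *   XXH64_hash_t const h0 = XXH64(msg, sizeof(msg) - 1, 0);
 *   XXH64_hash_t const h1 = XXH64(msg, sizeof(msg) - 1, 42);
 *   // h0 and h1 differ in general: the seed changes the whole hash.
 */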
1956
1957/******* Hash Streaming *******/
1958
1959XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
1960{
1961 return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
1962}
1963XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
1964{
1965 XXH_free(statePtr);
1966 return XXH_OK;
1967}
1968
1969XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
1970{
1971 memcpy(dstState, srcState, sizeof(*dstState));
1972}
1973
1974XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
1975{
1976 XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
1977 memset(&state, 0, sizeof(state));
1978 state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
1979 state.v2 = seed + XXH_PRIME64_2;
1980 state.v3 = seed + 0;
1981 state.v4 = seed - XXH_PRIME64_1;
1982 /* do not write into reserved64, might be removed in a future version */
1983 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
1984 return XXH_OK;
1985}
1986
1987XXH_PUBLIC_API XXH_errorcode
1988XXH64_update (XXH64_state_t* state, const void* input, size_t len)
1989{
1990 if (input==NULL)
1991#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1992 return XXH_OK;
1993#else
1994 return XXH_ERROR;
1995#endif
1996
1997 { const xxh_u8* p = (const xxh_u8*)input;
1998 const xxh_u8* const bEnd = p + len;
1999
2000 state->total_len += len;
2001
2002 if (state->memsize + len < 32) { /* fill in tmp buffer */
2003 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
2004 state->memsize += (xxh_u32)len;
2005 return XXH_OK;
2006 }
2007
2008 if (state->memsize) { /* tmp buffer is full */
2009 XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
2010 state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
2011 state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
2012 state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
2013 state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
2014 p += 32-state->memsize;
2015 state->memsize = 0;
2016 }
2017
2018 if (p+32 <= bEnd) {
2019 const xxh_u8* const limit = bEnd - 32;
2020 xxh_u64 v1 = state->v1;
2021 xxh_u64 v2 = state->v2;
2022 xxh_u64 v3 = state->v3;
2023 xxh_u64 v4 = state->v4;
2024
2025 do {
2026 v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
2027 v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
2028 v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
2029 v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
2030 } while (p<=limit);
2031
2032 state->v1 = v1;
2033 state->v2 = v2;
2034 state->v3 = v3;
2035 state->v4 = v4;
2036 }
2037
2038 if (p < bEnd) {
2039 XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
2040 state->memsize = (unsigned)(bEnd-p);
2041 }
2042 }
2043
2044 return XXH_OK;
2045}
2046
2047
2048XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)
2049{
2050 xxh_u64 h64;
2051
2052 if (state->total_len >= 32) {
2053 xxh_u64 const v1 = state->v1;
2054 xxh_u64 const v2 = state->v2;
2055 xxh_u64 const v3 = state->v3;
2056 xxh_u64 const v4 = state->v4;
2057
2058 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2059 h64 = XXH64_mergeRound(h64, v1);
2060 h64 = XXH64_mergeRound(h64, v2);
2061 h64 = XXH64_mergeRound(h64, v3);
2062 h64 = XXH64_mergeRound(h64, v4);
2063 } else {
2064 h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2065 }
2066
2067 h64 += (xxh_u64) state->total_len;
2068
2069 return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
2070}
2071
2072
2073/******* Canonical representation *******/
2074
2075XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
2076{
2077 XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2078 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2079 memcpy(dst, &hash, sizeof(*dst));
2080}
2081
2082XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
2083{
2084 return XXH_readBE64(src);
2085}
2086
2087
2088
2089/* *********************************************************************
2090* XXH3
2091* New generation hash designed for speed on small keys and vectorization
2092************************************************************************ */
2093
2094/* === Compiler specifics === */
2095
2096#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
2097# define XXH_RESTRICT restrict
2098#else
2099/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
2100# define XXH_RESTRICT /* disable */
2101#endif
2102
2103#if (defined(__GNUC__) && (__GNUC__ >= 3)) \
2104 || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
2105 || defined(__clang__)
2106# define XXH_likely(x) __builtin_expect(x, 1)
2107# define XXH_unlikely(x) __builtin_expect(x, 0)
2108#else
2109# define XXH_likely(x) (x)
2110# define XXH_unlikely(x) (x)
2111#endif
2112
2113#if defined(__GNUC__)
2114# if defined(__AVX2__)
2115# include <immintrin.h>
2116# elif defined(__SSE2__)
2117# include <emmintrin.h>
2118# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
2119# define inline __inline__ /* circumvent a clang bug */
2120# include <arm_neon.h>
2121# undef inline
2122# endif
2123#elif defined(_MSC_VER)
2124# include <intrin.h>
2125#endif
2126
2127/*
2128 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
2129 * remaining a true 64-bit/128-bit hash function.
2130 *
2131 * This is done by prioritizing a subset of 64-bit operations that can be
2132 * emulated without too many steps on the average 32-bit machine.
2133 *
2134 * For example, these two lines seem similar, and run equally fast on 64-bit:
2135 *
2136 * xxh_u64 x;
2137 * x ^= (x >> 47); // good
2138 * x ^= (x >> 13); // bad
2139 *
2140 * However, to a 32-bit machine, there is a major difference.
2141 *
2142 * x ^= (x >> 47) looks like this:
2143 *
2144 * x.lo ^= (x.hi >> (47 - 32));
2145 *
2146 * while x ^= (x >> 13) looks like this:
2147 *
2148 * // note: funnel shifts are not usually cheap.
2149 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
2150 * x.hi ^= (x.hi >> 13);
2151 *
2152 * The first one is significantly faster than the second, simply because the
2153 * shift is larger than 32. This means:
2154 * - All the bits we need are in the upper 32 bits, so we can ignore the lower
2155 * 32 bits in the shift.
2156 * - The shift result will always fit in the lower 32 bits, and therefore,
2157 * we can ignore the upper 32 bits in the xor.
2158 *
2159 * Thanks to this optimization, XXH3 only requires these features to be efficient:
2160 *
2161 * - Usable unaligned access
2162 * - A 32-bit or 64-bit ALU
2163 * - If 32-bit, a decent ADC instruction
2164 * - A 32 or 64-bit multiply with a 64-bit result
2165 * - For the 128-bit variant, a decent byteswap helps short inputs.
2166 *
2167 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
2168 * platforms which can run XXH32 can run XXH3 efficiently.
2169 *
2170 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
2171 * notable exception.
2172 *
2173 * First of all, Thumb-1 lacks support for the UMULL instruction which
2174 * performs the important long multiply. This means numerous __aeabi_lmul
2175 * calls.
2176 *
2177 * Second of all, the 8 functional registers are just not enough.
2178 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
2179 * Lo registers, and this shuffling results in thousands more MOVs than A32.
2180 *
2181 * A32 and T32 don't have this limitation. They can access all 14 registers,
2182 * do a 32->64 multiply with UMULL, and the flexible operand allowing free
2183 * shifts is helpful, too.
2184 *
2185 * Therefore, we do a quick sanity check.
2186 *
2187 * If compiling Thumb-1 for a target which supports ARM instructions, we will
2188 * emit a warning, as it is not a "sane" platform to compile for.
2189 *
2190 * Usually, if this happens, it is because of an accident and you probably need
2191 * to specify -march, as you likely meant to compile for a newer architecture.
2192 *
2193 * Credit: large sections of the vectorial and asm source code paths
2194 * have been contributed by @easyaspi314
2195 */
2196#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
2197# warning "XXH3 is highly inefficient without ARM or Thumb-2."
2198#endif
2199
2200/* ==========================================
2201 * Vectorization detection
2202 * ========================================== */
2203#define XXH_SCALAR 0 /* Portable scalar version */
2204#define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
2205#define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
2206#define XXH_AVX512 3 /* AVX512 for Skylake and Icelake */
2207#define XXH_NEON 4 /* NEON for most ARMv7-A and all AArch64 */
2208#define XXH_VSX 5 /* VSX and ZVector for POWER8/z13 */
2209
2210#ifndef XXH_VECTOR /* can be defined on command line */
2211# if defined(__AVX512F__)
2212# define XXH_VECTOR XXH_AVX512
2213# elif defined(__AVX2__)
2214# define XXH_VECTOR XXH_AVX2
2215# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2216# define XXH_VECTOR XXH_SSE2
2217# elif defined(__GNUC__) /* msvc support maybe later */ \
2218 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
2219 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2220 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2221# define XXH_VECTOR XXH_NEON
2222# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
2223 || (defined(__s390x__) && defined(__VEC__)) \
2224 && defined(__GNUC__) /* TODO: IBM XL */
2225# define XXH_VECTOR XXH_VSX
2226# else
2227# define XXH_VECTOR XXH_SCALAR
2228# endif
2229#endif
2230
2231/*
2232 * Controls the alignment of the accumulator,
2233 * for compatibility with aligned vector loads, which are usually faster.
2234 */
2235#ifndef XXH_ACC_ALIGN
2236# if defined(XXH_X86DISPATCH)
2237# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
2238# elif XXH_VECTOR == XXH_SCALAR /* scalar */
2239# define XXH_ACC_ALIGN 8
2240# elif XXH_VECTOR == XXH_SSE2 /* sse2 */
2241# define XXH_ACC_ALIGN 16
2242# elif XXH_VECTOR == XXH_AVX2 /* avx2 */
2243# define XXH_ACC_ALIGN 32
2244# elif XXH_VECTOR == XXH_NEON /* neon */
2245# define XXH_ACC_ALIGN 16
2246# elif XXH_VECTOR == XXH_VSX /* vsx */
2247# define XXH_ACC_ALIGN 16
2248# elif XXH_VECTOR == XXH_AVX512 /* avx512 */
2249# define XXH_ACC_ALIGN 64
2250# endif
2251#endif
2252
2253#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
2254 || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
2255# define XXH_SEC_ALIGN XXH_ACC_ALIGN
2256#else
2257# define XXH_SEC_ALIGN 8
2258#endif
2259
2260/*
2261 * UGLY HACK:
2262 * GCC usually generates the best code with -O3 for xxHash.
2263 *
2264 * However, when targeting AVX2, it is overzealous in its unrolling resulting
2265 * in code roughly 3/4 the speed of Clang.
2266 *
2267 * There are other issues, such as GCC splitting _mm256_loadu_si256 into
2268 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
2269 * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
2270 *
2271 * That is why when compiling the AVX2 version, it is recommended to use either
2272 * -O2 -mavx2 -march=haswell
2273 * or
2274 * -O2 -mavx2 -mno-avx256-split-unaligned-load
2275 * for decent performance, or to use Clang instead.
2276 *
2277 * Fortunately, we can control the first one with a pragma that forces GCC into
2278 * -O2, but the other one we can't control without "failed to inline always
2279 * inline function due to target mismatch" warnings.
2280 */
2281#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
2282 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2283 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
2284# pragma GCC push_options
2285# pragma GCC optimize("-O2")
2286#endif
2287
2288
2289#if XXH_VECTOR == XXH_NEON
2290/*
2291 * NEON's setup for vmlal_u32 is a little more complicated than it is on
2292 * SSE2, AVX2, and VSX.
2293 *
2294 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
2295 *
2296 * To do the same operation, the 128-bit 'Q' register needs to be split into
2297 * two 64-bit 'D' registers, performing this operation:
2298 *
2299 * [ a | b ]
2300 * | '---------. .--------' |
2301 * | x |
2302 * | .---------' '--------. |
2303 * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
2304 *
2305 * Due to significant changes in aarch64, the fastest method for aarch64 is
2306 * completely different than the fastest method for ARMv7-A.
2307 *
2308 * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
2309 * D11 will modify the high half of Q5. This is similar to how modifying AH
2310 * will only affect bits 8-15 of AX on x86.
2311 *
2312 * VZIP takes two registers, and puts even lanes in one register and odd lanes
2313 * in the other.
2314 *
2315 * On ARMv7-A, this strangely modifies both parameters in place instead of
2316 * taking the usual 3-operand form.
2317 *
2318 * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
2319 * lower and upper halves of the Q register to end up with the high and low
2320 * halves where we want - all in one instruction.
2321 *
2322 * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
2323 *
2324 * Unfortunately we need inline assembly for this: instructions modifying two
2325 * registers at once are not possible in GCC or Clang's IR, so they have to
2326 * create a copy.
2327 *
2328 * aarch64 requires a different approach.
2329 *
2330 * In order to make it easier to write a decent compiler for aarch64, many
2331 * quirks were removed, such as conditional execution.
2332 *
2333 * NEON was also affected by this.
2334 *
2335 * aarch64 cannot access the high bits of a Q-form register, and writes to a
2336 * D-form register zero the high bits, similar to how writes to W-form scalar
2337 * registers (or DWORD registers on x86_64) work.
2338 *
2339 * The formerly free vget_high intrinsics now require a vext (with a few
2340 * exceptions).
2341 *
2342 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
2343 * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
2344 * operand.
2345 *
2346 * The equivalent of the VZIP.32 on the lower and upper halves would be this
2347 * mess:
2348 *
2349 * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
2350 * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
2351 * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
2352 *
2353 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
2354 *
2355 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
2356 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
2357 *
2358 * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
2359 */
2360
2361/*
2362 * Function-like macro:
2363 * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
2364 * {
2365 * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
2366 * outHi = (uint32x2_t)(in >> 32);
2367 * in = UNDEFINED;
2368 * }
2369 */
2370# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
2371 && defined(__GNUC__) \
2372 && !defined(__aarch64__) && !defined(__arm64__)
2373# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2374 do { \
2375 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
2376 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
2377 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
2378 __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
2379 (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
2380 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
2381 } while (0)
2382# else
2383# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2384 do { \
2385 (outLo) = vmovn_u64 (in); \
2386 (outHi) = vshrn_n_u64 ((in), 32); \
2387 } while (0)
2388# endif
2389#endif /* XXH_VECTOR == XXH_NEON */
2390
2391/*
2392 * VSX and Z Vector helpers.
2393 *
2394 * This is very messy, and any pull requests to clean this up are welcome.
2395 *
2396 * There are a lot of problems with supporting VSX and s390x, due to
2397 * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
2398 */
2399#if XXH_VECTOR == XXH_VSX
2400# if defined(__s390x__)
2401# include <s390intrin.h>
2402# else
2403/* gcc's altivec.h can have the unwanted consequence of unconditionally
2404 * #defining the bool, vector, and pixel keywords,
2405 * which breaks programs already using these keywords for other purposes.
2406 * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
2407 * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
2408 * but it seems that, in some cases, it isn't.
2409 * Force the build macro to be defined, so that keywords are not altered.
2410 */
2411# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
2412# define __APPLE_ALTIVEC__
2413# endif
2414# include <altivec.h>
2415# endif
2416
2417typedef __vector unsigned long long xxh_u64x2;
2418typedef __vector unsigned char xxh_u8x16;
2419typedef __vector unsigned xxh_u32x4;
2420
2421# ifndef XXH_VSX_BE
2422# if defined(__BIG_ENDIAN__) \
2423 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
2424# define XXH_VSX_BE 1
2425# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
2426# warning "-maltivec=be is not recommended. Please use native endianness."
2427# define XXH_VSX_BE 1
2428# else
2429# define XXH_VSX_BE 0
2430# endif
2431# endif /* !defined(XXH_VSX_BE) */
2432
2433# if XXH_VSX_BE
2434/* A wrapper for POWER9's vec_revb. */
2435# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
2436# define XXH_vec_revb vec_revb
2437# else
2438XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
2439{
2440 xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
2441 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
2442 return vec_perm(val, val, vByteSwap);
2443}
2444# endif
2445# endif /* XXH_VSX_BE */
2446
2447/*
2448 * Performs an unaligned load and byte swaps it on big endian.
2449 */
2450XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
2451{
2452 xxh_u64x2 ret;
2453 memcpy(&ret, ptr, sizeof(xxh_u64x2));
2454# if XXH_VSX_BE
2455 ret = XXH_vec_revb(ret);
2456# endif
2457 return ret;
2458}
2459
2460/*
2461 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
2462 *
2463 * These intrinsics weren't added until GCC 8, despite existing for a while,
2464 * and they are endian dependent. Also, their meaning swaps depending on the version.
2465 */
2466# if defined(__s390x__)
2467 /* s390x is always big endian, no issue on this platform */
2468# define XXH_vec_mulo vec_mulo
2469# define XXH_vec_mule vec_mule
2470# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
2471/* Clang has a better way to control this: we can just use the builtin, which doesn't swap. */
2472# define XXH_vec_mulo __builtin_altivec_vmulouw
2473# define XXH_vec_mule __builtin_altivec_vmuleuw
2474# else
2475/* gcc needs inline assembly */
2476/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
2477XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
2478{
2479 xxh_u64x2 result;
2480 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
2481 return result;
2482}
2483XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
2484{
2485 xxh_u64x2 result;
2486 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
2487 return result;
2488}
2489# endif /* XXH_vec_mulo, XXH_vec_mule */
2490#endif /* XXH_VECTOR == XXH_VSX */
2491
2492
2493/* prefetch
2494 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
2495#if defined(XXH_NO_PREFETCH)
2496# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
2497#else
2498# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
2499# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
2500# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
2501# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
2502# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
2503# else
2504# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
2505# endif
2506#endif /* XXH_NO_PREFETCH */
2507
2508
2509/* ==========================================
2510 * XXH3 default settings
2511 * ========================================== */
2512
2513#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
2514
2515#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
2516# error "default keyset is not large enough"
2517#endif
2518
2519/* Pseudorandom secret taken directly from FARSH */
2520XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
2521 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
2522 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
2523 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
2524 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
2525 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
2526 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
2527 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
2528 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
2529 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
2530 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
2531 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
2532 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
2533};
2534
2535
2536#ifdef XXH_OLD_NAMES
2537# define kSecret XXH3_kSecret
2538#endif
2539
2540/*
2541 * Calculates a 32-bit to 64-bit long multiply.
2542 *
2543 * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
2544 * need to (but it shouldn't need to anyway, it is about 7 instructions to do
2545 * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
2546 * use that instead of the normal method.
2547 *
2548 * If you are compiling for platforms like Thumb-1 and don't have a better option,
2549 * you may also want to write your own long multiply routine here.
2550 *
2551 * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
2552 * {
2553 * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
2554 * }
2555 */
2556#if defined(_MSC_VER) && defined(_M_IX86)
2557# include <intrin.h>
2558# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
2559#else
2560/*
2561 * Downcast + upcast is usually better than masking on older compilers like
2562 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
2563 *
2564 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
2565 * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
2566 */
2567# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
2568#endif
2569
2570/*
2571 * Calculates a 64->128-bit long multiply.
2572 *
2573 * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
2574 */
2575static XXH128_hash_t
2576XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
2577{
2578 /*
2579 * GCC/Clang __uint128_t method.
2580 *
2581 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
2582 * This is usually the best way as it usually uses a native long 64-bit
2583 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
2584 *
2585 * Usually.
2586 *
2587 * However, on some 32-bit targets such as wasm, Clang (and Emscripten) define this
2588 * type despite not having the native arithmetic for it. This results in a laggy
2589 * compiler builtin call which calculates a full 128-bit multiply.
2590 * In that case it is best to use the portable one.
2591 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
2592 */
2593#if defined(__GNUC__) && !defined(__wasm__) \
2594 && defined(__SIZEOF_INT128__) \
2595 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
2596
2597 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
2598 XXH128_hash_t r128;
2599 r128.low64 = (xxh_u64)(product);
2600 r128.high64 = (xxh_u64)(product >> 64);
2601 return r128;
2602
2603 /*
2604 * MSVC for x64's _umul128 method.
2605 *
2606 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
2607 *
2608 * This compiles to single operand MUL on x64.
2609 */
2610#elif defined(_M_X64) || defined(_M_IA64)
2611
2612#ifndef _MSC_VER
2613# pragma intrinsic(_umul128)
2614#endif
2615 xxh_u64 product_high;
2616 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
2617 XXH128_hash_t r128;
2618 r128.low64 = product_low;
2619 r128.high64 = product_high;
2620 return r128;
2621
2622#else
2623 /*
2624 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
2625 *
2626 * This is a fast and simple grade school multiply, which is shown below
2627 * with base 10 arithmetic instead of base 0x100000000.
2628 *
2629 * 9 3 // D2 lhs = 93
2630 * x 7 5 // D2 rhs = 75
2631 * ----------
2632 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
2633 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
2634 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
2635 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
2636 * ---------
2637 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
2638 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
2639 * ---------
2640 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
2641 *
2642 * The reasons for adding the products like this are:
2643 * 1. It avoids manual carry tracking. Just like how
2644 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
2645 * This avoids a lot of complexity.
2646 *
2647 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
2648 * instruction available in ARM's Digital Signal Processing extension
2649 * in 32-bit ARMv6 and later, which is shown below:
2650 *
2651 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
2652 * {
2653 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
2654 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
2655 * *RdHi = (xxh_u32)(product >> 32);
2656 * }
2657 *
2658 * This instruction was designed for efficient long multiplication, and
2659 * allows this to be calculated in only 4 instructions at speeds
2660 * comparable to some 64-bit ALUs.
2661 *
2662 * 3. It isn't terrible on other platforms. Usually this will be a couple
2663 * of 32-bit ADD/ADCs.
2664 */
2665
2666 /* First calculate all of the cross products. */
2667 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
2668 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
2669 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
2670 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
2671
2672 /* Now add the products together. These will never overflow. */
2673 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
2674 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
2675 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
2676
2677 XXH128_hash_t r128;
2678 r128.low64 = lower;
2679 r128.high64 = upper;
2680 return r128;
2681#endif
2682}
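/*
 * Illustrative sketch (not part of the library): a quick sanity check of
 * XXH_mult64to128(), assuming a compiler that provides __uint128_t and
 * <assert.h>. Handy when porting the portable scalar path to a new platform.
 *
 *   xxh_u64 const a = 0x9E3779B185EBCA87ULL, b = 0xC2B2AE3D27D4EB4FULL;
 *   XXH128_hash_t const r = XXH_mult64to128(a, b);
 *   __uint128_t const ref = (__uint128_t)a * b;
 *   assert(r.low64 == (xxh_u64)ref && r.high64 == (xxh_u64)(ref >> 64));
 */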
2683
2684/*
2685 * Does a 64-bit to 128-bit multiply, then XOR folds it.
2686 *
2687 * The reason for the separate function is to prevent passing too many structs
2688 * around by value. This will hopefully inline the multiply, but we don't force it.
2689 */
2690static xxh_u64
2691XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
2692{
2693 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
2694 return product.low64 ^ product.high64;
2695}
2696
2697/* Seems to produce slightly better code on GCC for some reason. */
2698XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
2699{
2700 XXH_ASSERT(0 <= shift && shift < 64);
2701 return v64 ^ (v64 >> shift);
2702}
2703
2704/*
2705 * This is a fast avalanche stage,
2706 * suitable when input bits are already partially mixed
2707 */
2708static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
2709{
2710 h64 = XXH_xorshift64(h64, 37);
2711 h64 *= 0x165667919E3779F9ULL;
2712 h64 = XXH_xorshift64(h64, 32);
2713 return h64;
2714}
2715
2716/*
2717 * This is a stronger avalanche,
2718 * inspired by Pelle Evensen's rrmxmx
2719 * preferable when input has not been previously mixed
2720 */
2721static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
2722{
2723 /* this mix is inspired by Pelle Evensen's rrmxmx */
2724 h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
2725 h64 *= 0x9FB21C651E98DF25ULL;
2726 h64 ^= (h64 >> 35) + len ;
2727 h64 *= 0x9FB21C651E98DF25ULL;
2728 return XXH_xorshift64(h64, 28);
2729}
2730
2731
2732/* ==========================================
2733 * Short keys
2734 * ==========================================
2735 * One of the shortcomings of XXH32 and XXH64 was that their performance was
2736 * sub-optimal on short lengths. They used an iterative algorithm which strongly
2737 * favored lengths that were a multiple of 4 or 8.
2738 *
2739 * Instead of iterating over individual inputs, we use a set of single shot
2740 * functions which piece together a range of lengths and operate in constant time.
2741 *
2742 * Additionally, the number of multiplies has been significantly reduced. This
2743 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
2744 *
2745 * Depending on the platform, this may or may not be faster than XXH32, but it
2746 * is almost guaranteed to be faster than XXH64.
2747 */
2748
2749/*
2750 * At very short lengths, there isn't enough input to fully hide secrets, or use
2751 * the entire secret.
2752 *
2753 * There is also only a limited amount of mixing we can do before significantly
2754 * impacting performance.
2755 *
2756 * Therefore, we use different sections of the secret and always mix two secret
2757 * samples with an XOR. This should have no effect on performance on the
2758 * seedless or withSeed variants because everything _should_ be constant folded
2759 * by modern compilers.
2760 *
2761 * The XOR mixing hides individual parts of the secret and increases entropy.
2762 *
2763 * This adds an extra layer of strength for custom secrets.
2764 */
2765XXH_FORCE_INLINE XXH64_hash_t
2766XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2767{
2768 XXH_ASSERT(input != NULL);
2769 XXH_ASSERT(1 <= len && len <= 3);
2770 XXH_ASSERT(secret != NULL);
2771 /*
2772 * len = 1: combined = { input[0], 0x01, input[0], input[0] }
2773 * len = 2: combined = { input[1], 0x02, input[0], input[1] }
2774 * len = 3: combined = { input[2], 0x03, input[0], input[1] }
2775 */
2776 { xxh_u8 const c1 = input[0];
2777 xxh_u8 const c2 = input[len >> 1];
2778 xxh_u8 const c3 = input[len - 1];
2779 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24)
2780 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
2781 xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
2782 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
2783 return XXH64_avalanche(keyed);
2784 }
2785}
2786
2787XXH_FORCE_INLINE XXH64_hash_t
2788XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2789{
2790 XXH_ASSERT(input != NULL);
2791 XXH_ASSERT(secret != NULL);
2792 XXH_ASSERT(4 <= len && len < 8);
2793 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
2794 { xxh_u32 const input1 = XXH_readLE32(input);
2795 xxh_u32 const input2 = XXH_readLE32(input + len - 4);
2796 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
2797 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
2798 xxh_u64 const keyed = input64 ^ bitflip;
2799 return XXH3_rrmxmx(keyed, len);
2800 }
2801}
2802
2803XXH_FORCE_INLINE XXH64_hash_t
2804XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2805{
2806 XXH_ASSERT(input != NULL);
2807 XXH_ASSERT(secret != NULL);
2808 XXH_ASSERT(8 <= len && len <= 16);
2809 { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
2810 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
2811 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
2812 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
2813 xxh_u64 const acc = len
2814 + XXH_swap64(input_lo) + input_hi
2815 + XXH3_mul128_fold64(input_lo, input_hi);
2816 return XXH3_avalanche(acc);
2817 }
2818}
2819
2820XXH_FORCE_INLINE XXH64_hash_t
2821XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
2822{
2823 XXH_ASSERT(len <= 16);
2824 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
2825 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
2826 if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
2827 return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
2828 }
2829}
2830
2831/*
2832 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
2833 * multiplication by zero, affecting hashes of lengths 17 to 240.
2834 *
2835 * However, they are very unlikely.
2836 *
2837 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
2838 * unseeded non-cryptographic hashes, it does not attempt to defend itself
2839 * against specially crafted inputs, only random inputs.
2840 *
2841 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
2842 * cancelling out the secret is taken an arbitrary number of times (addressed
2843 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
2844 * and/or proper seeding:
2845 *
2846 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
2847 * function that is only called up to 16 times per hash with up to 240 bytes of
2848 * input.
2849 *
2850 * This is not too bad for a non-cryptographic hash function, especially with
2851 * only 64 bit outputs.
2852 *
2853 * The 128-bit variant (which trades some speed for strength) is NOT affected
2854 * by this, although it is always a good idea to use a proper seed if you care
2855 * about strength.
2856 */
2857XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
2858 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
2859{
2860#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2861 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
2862 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
2863 /*
2864 * UGLY HACK:
2865 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
2866 * slower code.
2867 *
2868 * By forcing seed64 into a register, we disrupt the cost model and
2869 * cause it to scalarize. See `XXH32_round()`
2870 *
2871 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
2872 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
2873 * GCC 9.2, despite both emitting scalar code.
2874 *
2875 * GCC generates much better scalar code than Clang for the rest of XXH3,
2876 * which is why finding a more optimal codepath is an interest.
2877 */
2878 __asm__ ("" : "+r" (seed64));
2879#endif
2880 { xxh_u64 const input_lo = XXH_readLE64(input);
2881 xxh_u64 const input_hi = XXH_readLE64(input+8);
2882 return XXH3_mul128_fold64(
2883 input_lo ^ (XXH_readLE64(secret) + seed64),
2884 input_hi ^ (XXH_readLE64(secret+8) - seed64)
2885 );
2886 }
2887}
2888
2889/* For mid range keys, XXH3 uses a Mum-hash variant. */
2890XXH_FORCE_INLINE XXH64_hash_t
2891XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
2892 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
2893 XXH64_hash_t seed)
2894{
2895 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
2896 XXH_ASSERT(16 < len && len <= 128);
2897
2898 { xxh_u64 acc = len * XXH_PRIME64_1;
2899 if (len > 32) {
2900 if (len > 64) {
2901 if (len > 96) {
2902 acc += XXH3_mix16B(input+48, secret+96, seed);
2903 acc += XXH3_mix16B(input+len-64, secret+112, seed);
2904 }
2905 acc += XXH3_mix16B(input+32, secret+64, seed);
2906 acc += XXH3_mix16B(input+len-48, secret+80, seed);
2907 }
2908 acc += XXH3_mix16B(input+16, secret+32, seed);
2909 acc += XXH3_mix16B(input+len-32, secret+48, seed);
2910 }
2911 acc += XXH3_mix16B(input+0, secret+0, seed);
2912 acc += XXH3_mix16B(input+len-16, secret+16, seed);
2913
2914 return XXH3_avalanche(acc);
2915 }
2916}
2917
2918#define XXH3_MIDSIZE_MAX 240
2919
2920XXH_NO_INLINE XXH64_hash_t
2921XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
2922 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
2923 XXH64_hash_t seed)
2924{
2925 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
2926 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
2927
2928 #define XXH3_MIDSIZE_STARTOFFSET 3
2929 #define XXH3_MIDSIZE_LASTOFFSET 17
2930
2931 { xxh_u64 acc = len * XXH_PRIME64_1;
2932 int const nbRounds = (int)len / 16;
2933 int i;
2934 for (i=0; i<8; i++) {
2935 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
2936 }
2937 acc = XXH3_avalanche(acc);
2938 XXH_ASSERT(nbRounds >= 8);
2939#if defined(__clang__) /* Clang */ \
2940 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
2941 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
2942 /*
2943 * UGLY HACK:
2944 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC on x86.
2945 * Everywhere else, it uses scalar code.
2946 *
2947 * For 64->128-bit multiplies, even if the NEON code were 100% optimal, it
2948 * would still be slower than UMAAL (see XXH_mult64to128).
2949 *
2950 * Unfortunately, Clang doesn't handle the long multiplies properly and
2951 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
2952 * scalarized into an ugly mess of VMOV.32 instructions.
2953 *
2954 * This mess is difficult to avoid without turning autovectorization
2955 * off completely, but the affected cases are usually relatively minor
2956 * and/or not worth fixing.
2957 *
2958 * This loop is the easiest to fix, as unlike XXH32, this pragma
2959 * _actually works_ because it is a loop vectorization instead of an
2960 * SLP vectorization.
2961 */
2962 #pragma clang loop vectorize(disable)
2963#endif
2964 for (i=8 ; i < nbRounds; i++) {
2965 acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
2966 }
2967 /* last bytes */
2968 acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
2969 return XXH3_avalanche(acc);
2970 }
2971}
2972
2973
2974/* ======= Long Keys ======= */
2975
2976#define XXH_STRIPE_LEN 64
2977#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
2978#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
2979
2980#ifdef XXH_OLD_NAMES
2981# define STRIPE_LEN XXH_STRIPE_LEN
2982# define ACC_NB XXH_ACC_NB
2983#endif
2984
2985XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
2986{
2987 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
2988 memcpy(dst, &v64, sizeof(v64));
2989}
2990
2991/* Several intrinsic functions below are supposed to accept __int64 as argument,
2992 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
2993 * However, several environments do not define __int64 type,
2994 * requiring a workaround.
2995 */
2996#if !defined (__VMS) \
2997 && (defined (__cplusplus) \
2998 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
2999 typedef int64_t xxh_i64;
3000#else
3001 /* the following type must have a width of 64-bit */
3002 typedef long long xxh_i64;
3003#endif
3004
3005/*
3006 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
3007 *
3008 * It is a hardened version of UMAC, based on FARSH's implementation.
3009 *
3010 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
3011 * implementations, and it is ridiculously fast.
3012 *
3013 * We harden it by mixing the original input into the accumulators as well as the product.
3014 *
3015 * This means that in the (relatively likely) case of a multiply by zero, the
3016 * original input is preserved.
3017 *
3018 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
3019 * cross-pollination, as otherwise the upper and lower halves would be
3020 * essentially independent.
3021 *
3022 * This doesn't matter on 64-bit hashes since they all get merged together in
3023 * the end, so we skip the extra step.
3024 *
3025 * Both XXH3_64bits and XXH3_128bits use this subroutine.
3026 */
3027
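/*
 * As a reference for the SIMD code paths below, one accumulation lane in
 * scalar form is roughly (this mirrors XXH3_accumulate_512_scalar further down):
 *
 *     data_val   = XXH_readLE64(input  + 8*i);
 *     data_key   = data_val ^ XXH_readLE64(secret + 8*i);
 *     acc[i ^ 1] += data_val;                                    // mix in the raw input
 *     acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64 multiply
 */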
3028#if (XXH_VECTOR == XXH_AVX512) || defined(XXH_X86DISPATCH)
3029
3030#ifndef XXH_TARGET_AVX512
3031# define XXH_TARGET_AVX512 /* disable attribute target */
3032#endif
3033
3034XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3035XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3036 const void* XXH_RESTRICT input,
3037 const void* XXH_RESTRICT secret)
3038{
3039 XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
3040 XXH_ASSERT((((size_t)acc) & 63) == 0);
3041 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3042
3043 {
3044 /* data_vec = input[0]; */
3045 __m512i const data_vec = _mm512_loadu_si512 (input);
3046 /* key_vec = secret[0]; */
3047 __m512i const key_vec = _mm512_loadu_si512 (secret);
3048 /* data_key = data_vec ^ key_vec; */
3049 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
3050 /* data_key_lo = data_key >> 32; */
3051 __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3052 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3053 __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
3054 /* xacc[0] += swap(data_vec); */
3055 __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
3056 __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
3057 /* xacc[0] += product; */
3058 *xacc = _mm512_add_epi64(product, sum);
3059 }
3060}
3061
3062/*
3063 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
3064 *
3065 * Multiplication isn't perfect, as explained by Google in HighwayHash:
3066 *
3067 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
3068 * // varying degrees. In descending order of goodness, bytes
3069 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
3070 * // As expected, the upper and lower bytes are much worse.
3071 *
3072 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
3073 *
3074 * Since our algorithm uses a pseudorandom secret to add some variance into the
3075 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
3076 *
3077 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
3078 * extraction.
3079 *
3080 * Both XXH3_64bits and XXH3_128bits use this subroutine.
3081 */
3082
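/*
 * As a reference for the SIMD code paths below, one scramble lane in scalar
 * form is roughly (this mirrors XXH3_scrambleAcc_scalar further down; the SIMD
 * variants split the multiply into two 32-bit halves to get the same result):
 *
 *     acc64  = acc[i];
 *     acc64 ^= acc64 >> 47;                 // xorshift
 *     acc64 ^= XXH_readLE64(secret + 8*i);  // mix in the secret
 *     acc64 *= XXH_PRIME32_1;               // 64-bit multiply by a 32-bit prime
 *     acc[i] = acc64;
 */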
3083XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3084XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3085{
3086 XXH_ASSERT((((size_t)acc) & 63) == 0);
3087 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3088 { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
3089 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3090
3091 /* xacc[0] ^= (xacc[0] >> 47) */
3092 __m512i const acc_vec = *xacc;
3093 __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
3094 __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
3095 /* xacc[0] ^= secret; */
3096 __m512i const key_vec = _mm512_loadu_si512 (secret);
3097 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
3098
3099 /* xacc[0] *= XXH_PRIME32_1; */
3100 __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
3101 __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
3102 __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
3103 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
3104 }
3105}
3106
3107XXH_FORCE_INLINE XXH_TARGET_AVX512 void
3108XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3109{
3110 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
3111 XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
3112 XXH_ASSERT(((size_t)customSecret & 63) == 0);
3113 (void)(&XXH_writeLE64);
3114 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
3115 __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, -(xxh_i64)seed64);
3116
3117 XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret;
3118 XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret;
3119 int i;
3120 for (i=0; i < nbRounds; ++i) {
3121 /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not 'void const*',
3122 * which triggers a "discards ‘const’ qualifier" warning. */
3123 union {
3124 XXH_ALIGN(64) const __m512i* cp;
3125 XXH_ALIGN(64) void* p;
3126 } remote_const_void;
3127 remote_const_void.cp = src + i;
3128 dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
3129 } }
3130}
3131
3132#endif
3133
3134#if (XXH_VECTOR == XXH_AVX2) || defined(XXH_X86DISPATCH)
3135
3136#ifndef XXH_TARGET_AVX2
3137# define XXH_TARGET_AVX2 /* disable attribute target */
3138#endif
3139
3140XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3141XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3142 const void* XXH_RESTRICT input,
3143 const void* XXH_RESTRICT secret)
3144{
3145 XXH_ASSERT((((size_t)acc) & 31) == 0);
3146 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
3147 /* Unaligned. This is mainly for pointer arithmetic, and because
3148 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3149 const __m256i* const xinput = (const __m256i *) input;
3150 /* Unaligned. This is mainly for pointer arithmetic, and because
3151 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3152 const __m256i* const xsecret = (const __m256i *) secret;
3153
3154 size_t i;
3155 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
3156 /* data_vec = xinput[i]; */
3157 __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
3158 /* key_vec = xsecret[i]; */
3159 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
3160 /* data_key = data_vec ^ key_vec; */
3161 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3162 /* data_key_lo = data_key >> 32; */
3163 __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3164 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3165 __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
3166 /* xacc[i] += swap(data_vec); */
3167 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
3168 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
3169 /* xacc[i] += product; */
3170 xacc[i] = _mm256_add_epi64(product, sum);
3171 } }
3172}
3173
3174XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3175XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3176{
3177 XXH_ASSERT((((size_t)acc) & 31) == 0);
3178 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
3179 /* Unaligned. This is mainly for pointer arithmetic, and because
3180 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3181 const __m256i* const xsecret = (const __m256i *) secret;
3182 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
3183
3184 size_t i;
3185 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
3186 /* xacc[i] ^= (xacc[i] >> 47) */
3187 __m256i const acc_vec = xacc[i];
3188 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
3189 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
3190 /* xacc[i] ^= xsecret; */
3191 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
3192 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3193
3194 /* xacc[i] *= XXH_PRIME32_1; */
3195 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3196 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
3197 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
3198 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
3199 }
3200 }
3201}
3202
3203XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3204{
3205 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
3206 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
3207 XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
3208 (void)(&XXH_writeLE64);
3209 XXH_PREFETCH(customSecret);
3210 { __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);
3211
3212 XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret;
3213 XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret;
3214
3215# if defined(__GNUC__) || defined(__clang__)
3216 /*
3217 * On GCC & Clang, marking 'dest' as modified causes the compiler to:
3218 * - not extract the secret from SSE registers in the internal loop
3219 * - use fewer registers, and avoid pushing these registers onto the stack
3220 * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
3221 * customSecret, and on aarch64, this prevented LDP from merging two
3222 * loads together for free. Putting the loads together before the stores
3223 * properly generates LDP.
3224 */
3225 __asm__("" : "+r" (dest));
3226# endif
3227
3228 /* GCC at -O2 needs the loop unrolled manually */
3229 dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
3230 dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
3231 dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
3232 dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
3233 dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
3234 dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
3235 }
3236}
3237
3238#endif
3239
3240#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
3241
3242#ifndef XXH_TARGET_SSE2
3243# define XXH_TARGET_SSE2 /* disable attribute target */
3244#endif
3245
3246XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3247XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3248 const void* XXH_RESTRICT input,
3249 const void* XXH_RESTRICT secret)
3250{
3251 /* SSE2 is just a half-scale version of the AVX2 version. */
3252 XXH_ASSERT((((size_t)acc) & 15) == 0);
3253 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
3254 /* Unaligned. This is mainly for pointer arithmetic, and because
3255 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3256 const __m128i* const xinput = (const __m128i *) input;
3257 /* Unaligned. This is mainly for pointer arithmetic, and because
3258 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3259 const __m128i* const xsecret = (const __m128i *) secret;
3260
3261 size_t i;
3262 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
3263 /* data_vec = xinput[i]; */
3264 __m128i const data_vec = _mm_loadu_si128 (xinput+i);
3265 /* key_vec = xsecret[i]; */
3266 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
3267 /* data_key = data_vec ^ key_vec; */
3268 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
3269 /* data_key_lo = data_key >> 32; */
3270 __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3271 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3272 __m128i const product = _mm_mul_epu32 (data_key, data_key_lo);
3273 /* xacc[i] += swap(data_vec); */
3274 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
3275 __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
3276 /* xacc[i] += product; */
3277 xacc[i] = _mm_add_epi64(product, sum);
3278 } }
3279}
3280
3281XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3282XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3283{
3284 XXH_ASSERT((((size_t)acc) & 15) == 0);
3285 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
3286 /* Unaligned. This is mainly for pointer arithmetic, and because
3287 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3288 const __m128i* const xsecret = (const __m128i *) secret;
3289 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
3290
3291 size_t i;
3292 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
3293 /* xacc[i] ^= (xacc[i] >> 47) */
3294 __m128i const acc_vec = xacc[i];
3295 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
3296 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
3297 /* xacc[i] ^= xsecret[i]; */
3298 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
3299 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
3300
3301 /* xacc[i] *= XXH_PRIME32_1; */
3302 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
3303 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
3304 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
3305 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
3306 }
3307 }
3308}
3309
3310XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3311{
3312 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
3313 (void)(&XXH_writeLE64);
3314 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
3315
3316# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
3317 // MSVC 32bit mode does not support _mm_set_epi64x before 2015
3318 XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, -(xxh_i64)seed64 };
3319 __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
3320# else
3321 __m128i const seed = _mm_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64);
3322# endif
3323 int i;
3324
3325 XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret;
3326 XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret;
3327# if defined(__GNUC__) || defined(__clang__)
3328 /*
3329 * On GCC & Clang, marking 'dest' as modified causes the compiler to:
3330 * - not extract the secret from SSE registers in the internal loop
3331 * - use fewer registers, and avoid pushing these registers onto the stack
3332 */
3333 __asm__("" : "+r" (dest));
3334# endif
3335
3336 for (i=0; i < nbRounds; ++i) {
3337 dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
3338 } }
3339}
3340
3341#endif
3342
3343#if (XXH_VECTOR == XXH_NEON)
3344
3345XXH_FORCE_INLINE void
3346XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
3347 const void* XXH_RESTRICT input,
3348 const void* XXH_RESTRICT secret)
3349{
3350 XXH_ASSERT((((size_t)acc) & 15) == 0);
3351 {
3352 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
3353 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
3354 uint8_t const* const xinput = (const uint8_t *) input;
3355 uint8_t const* const xsecret = (const uint8_t *) secret;
3356
3357 size_t i;
3358 for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
3359 /* data_vec = xinput[i]; */
3360 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
3361 /* key_vec = xsecret[i]; */
3362 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3363 uint64x2_t data_key;
3364 uint32x2_t data_key_lo, data_key_hi;
3365 /* xacc[i] += swap(data_vec); */
3366 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
3367 uint64x2_t const swapped = vextq_u64(data64, data64, 1);
3368 xacc[i] = vaddq_u64 (xacc[i], swapped);
3369 /* data_key = data_vec ^ key_vec; */
3370 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
3371 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
3372 * data_key_hi = (uint32x2_t) (data_key >> 32);
3373 * data_key = UNDEFINED; */
3374 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3375 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
3376 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
3377
3378 }
3379 }
3380}
3381
3382XXH_FORCE_INLINE void
3383XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3384{
3385 XXH_ASSERT((((size_t)acc) & 15) == 0);
3386
3387 { uint64x2_t* xacc = (uint64x2_t*) acc;
3388 uint8_t const* xsecret = (uint8_t const*) secret;
3389 uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
3390
3391 size_t i;
3392 for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
3393 /* xacc[i] ^= (xacc[i] >> 47); */
3394 uint64x2_t acc_vec = xacc[i];
3395 uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
3396 uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
3397
3398 /* xacc[i] ^= xsecret[i]; */
3399 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3400 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
3401
3402 /* xacc[i] *= XXH_PRIME32_1 */
3403 uint32x2_t data_key_lo, data_key_hi;
3404 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
3405 * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
3406 * xacc[i] = UNDEFINED; */
3407 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3408 { /*
3409 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
3410 *
3411 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
3412 * incorrectly "optimize" this:
3413 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
3414 * shifted = vshll_n_u32(tmp, 32);
3415 * to this:
3416 * tmp = "vmulq_u64"(a, b); // no such thing!
3417 * shifted = vshlq_n_u64(tmp, 32);
3418 *
3419 * However, unlike SSE, Clang lacks a 64-bit multiply routine
3420 * for NEON, and it scalarizes two 64-bit multiplies instead.
3421 *
3422 * vmull_u32 has the same timing as vmul_u32, and it avoids
3423 * this bug completely.
3424 * See https://bugs.llvm.org/show_bug.cgi?id=39967
3425 */
3426 uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
3427 /* xacc[i] = prod_hi << 32; */
3428 xacc[i] = vshlq_n_u64(prod_hi, 32);
3429 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
3430 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
3431 }
3432 } }
3433}
3434
3435#endif
3436
3437#if (XXH_VECTOR == XXH_VSX)
3438
3439XXH_FORCE_INLINE void
3440XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
3441 const void* XXH_RESTRICT input,
3442 const void* XXH_RESTRICT secret)
3443{
3444 xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
3445 xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
3446 xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
3447 xxh_u64x2 const v32 = { 32, 32 };
3448 size_t i;
3449 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
3450 /* data_vec = xinput[i]; */
3451 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
3452 /* key_vec = xsecret[i]; */
3453 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
3454 xxh_u64x2 const data_key = data_vec ^ key_vec;
3455 /* shuffled = (data_key << 32) | (data_key >> 32); */
3456 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
3457 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
3458 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
3459 xacc[i] += product;
3460
3461 /* swap high and low halves */
3462#ifdef __s390x__
3463 xacc[i] += vec_permi(data_vec, data_vec, 2);
3464#else
3465 xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
3466#endif
3467 }
3468}
3469
3470XXH_FORCE_INLINE void
3471XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3472{
3473 XXH_ASSERT((((size_t)acc) & 15) == 0);
3474
3475 { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
3476 const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
3477 /* constants */
3478 xxh_u64x2 const v32 = { 32, 32 };
3479 xxh_u64x2 const v47 = { 47, 47 };
3480 xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
3481 size_t i;
3482 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
3483 /* xacc[i] ^= (xacc[i] >> 47); */
3484 xxh_u64x2 const acc_vec = xacc[i];
3485 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
3486
3487 /* xacc[i] ^= xsecret[i]; */
3488 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
3489 xxh_u64x2 const data_key = data_vec ^ key_vec;
3490
3491 /* xacc[i] *= XXH_PRIME32_1 */
3492 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
3493 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
3494 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
3495 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
3496 xacc[i] = prod_odd + (prod_even << v32);
3497 } }
3498}
3499
3500#endif
3501
3502/* scalar variants - universal */
3503
3504XXH_FORCE_INLINE void
3505XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
3506 const void* XXH_RESTRICT input,
3507 const void* XXH_RESTRICT secret)
3508{
3509 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
3510 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
3511 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
3512 size_t i;
3513 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
3514 for (i=0; i < XXH_ACC_NB; i++) {
3515 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
3516 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
3517 xacc[i ^ 1] += data_val; /* swap adjacent lanes */
3518 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
3519 }
3520}
3521
3522XXH_FORCE_INLINE void
3523XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3524{
3525 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
3526 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
3527 size_t i;
3528 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
3529 for (i=0; i < XXH_ACC_NB; i++) {
3530 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
3531 xxh_u64 acc64 = xacc[i];
3532 acc64 = XXH_xorshift64(acc64, 47);
3533 acc64 ^= key64;
3534 acc64 *= XXH_PRIME32_1;
3535 xacc[i] = acc64;
3536 }
3537}
3538
3539XXH_FORCE_INLINE void
3540XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3541{
3542 /*
3543 * We need a separate pointer for the hack below,
3544 * which requires a non-const pointer.
3545 * Any decent compiler will optimize this out otherwise.
3546 */
3547 const xxh_u8* kSecretPtr = XXH3_kSecret;
3548 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
3549
3550#if defined(__clang__) && defined(__aarch64__)
3551 /*
3552 * UGLY HACK:
3553 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
3554 * placed sequentially, in order, at the top of the unrolled loop.
3555 *
3556 * While MOVK is great for generating constants (2 cycles for a 64-bit
3557 * constant compared to 4 cycles for LDR), long MOVK chains stall the
3558 * integer pipelines:
3559 * I L S
3560 * MOVK
3561 * MOVK
3562 * MOVK
3563 * MOVK
3564 * ADD
3565 * SUB STR
3566 * STR
3567 * By forcing loads from memory (as the asm line causes Clang to assume
3568 * that XXH3_kSecretPtr has been changed), the pipelines are used more
3569 * efficiently:
3570 * I L S
3571 * LDR
3572 * ADD LDR
3573 * SUB STR
3574 * STR
3575 * XXH3_64bits_withSeed, len == 256, Snapdragon 835
3576 * without hack: 2654.4 MB/s
3577 * with hack: 3202.9 MB/s
3578 */
3579 __asm__("" : "+r" (kSecretPtr));
3580#endif
3581 /*
3582 * Note: in debug mode, this overrides the asm optimization
3583 * and Clang will emit MOVK chains again.
3584 */
3585 XXH_ASSERT(kSecretPtr == XXH3_kSecret);
3586
3587 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
3588 int i;
3589 for (i=0; i < nbRounds; i++) {
3590 /*
3591 * The asm hack causes Clang to assume that kSecretPtr aliases with
3592 * customSecret, and on aarch64, this prevented LDP from merging two
3593 * loads together for free. Putting the loads together before the stores
3594 * properly generates LDP.
3595 */
3596 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64;
3597 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
3598 XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo);
3599 XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
3600 } }
3601}
3602
3603
3604typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
3605typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
3606typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
3607
3608
3609#if (XXH_VECTOR == XXH_AVX512)
3610
3611#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
3612#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
3613#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
3614
3615#elif (XXH_VECTOR == XXH_AVX2)
3616
3617#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
3618#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
3619#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
3620
3621#elif (XXH_VECTOR == XXH_SSE2)
3622
3623#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
3624#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
3625#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
3626
3627#elif (XXH_VECTOR == XXH_NEON)
3628
3629#define XXH3_accumulate_512 XXH3_accumulate_512_neon
3630#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
3631#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3632
3633#elif (XXH_VECTOR == XXH_VSX)
3634
3635#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
3636#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
3637#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3638
3639#else /* scalar */
3640
3641#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
3642#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
3643#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
3644
3645#endif
3646
3647
3648
3649#ifndef XXH_PREFETCH_DIST
3650# ifdef __clang__
3651# define XXH_PREFETCH_DIST 320
3652# else
3653# if (XXH_VECTOR == XXH_AVX512)
3654# define XXH_PREFETCH_DIST 512
3655# else
3656# define XXH_PREFETCH_DIST 384
3657# endif
3658# endif /* __clang__ */
3659#endif /* XXH_PREFETCH_DIST */
3660
3661/*
3662 * XXH3_accumulate()
3663 * Loops over XXH3_accumulate_512().
3664 * Assumption: nbStripes will not overflow the secret size
3665 */
3666XXH_FORCE_INLINE void
3667XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
3668 const xxh_u8* XXH_RESTRICT input,
3669 const xxh_u8* XXH_RESTRICT secret,
3670 size_t nbStripes,
3671 XXH3_f_accumulate_512 f_acc512)
3672{
3673 size_t n;
3674 for (n = 0; n < nbStripes; n++ ) {
3675 const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
3676 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
3677 f_acc512(acc,
3678 in,
3679 secret + n*XXH_SECRET_CONSUME_RATE);
3680 }
3681}
3682
3683XXH_FORCE_INLINE void
3684XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
3685 const xxh_u8* XXH_RESTRICT input, size_t len,
3686 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
3687 XXH3_f_accumulate_512 f_acc512,
3688 XXH3_f_scrambleAcc f_scramble)
3689{
3690 size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
3691 size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
3692 size_t const nb_blocks = (len - 1) / block_len;
3693
3694 size_t n;
3695
3696 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3697
3698 for (n = 0; n < nb_blocks; n++) {
3699 XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
3700 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
3701 }
3702
3703 /* last partial block */
3704 XXH_ASSERT(len > XXH_STRIPE_LEN);
3705 { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
3706 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
3707 XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
3708
3709 /* last stripe */
3710 { const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
3711#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
3712 f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
3713 } }
3714}
3715
3716XXH_FORCE_INLINE xxh_u64
3717XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
3718{
3719 return XXH3_mul128_fold64(
3720 acc[0] ^ XXH_readLE64(secret),
3721 acc[1] ^ XXH_readLE64(secret+8) );
3722}
3723
3724static XXH64_hash_t
3725XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
3726{
3727 xxh_u64 result64 = start;
3728 size_t i = 0;
3729
3730 for (i = 0; i < 4; i++) {
3731 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
3732#if defined(__clang__) /* Clang */ \
3733 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
3734 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
3735 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
3736 /*
3737 * UGLY HACK:
3738 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
3739 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
3740 * XXH3_64bits, len == 256, Snapdragon 835:
3741 * without hack: 2063.7 MB/s
3742 * with hack: 2560.7 MB/s
3743 */
3744 __asm__("" : "+r" (result64));
3745#endif
3746 }
3747
3748 return XXH3_avalanche(result64);
3749}
3750
3751#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
3752 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
3753
3754XXH_FORCE_INLINE XXH64_hash_t
3755XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
3756 const void* XXH_RESTRICT secret, size_t secretSize,
3757 XXH3_f_accumulate_512 f_acc512,
3758 XXH3_f_scrambleAcc f_scramble)
3759{
3760 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
3761
3762 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
3763
3764 /* converge into final hash */
3765 XXH_STATIC_ASSERT(sizeof(acc) == 64);
3766 /* do not align on 8, so that the secret is different from the accumulator */
3767#define XXH_SECRET_MERGEACCS_START 11
3768 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
3769 return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
3770}
3771
3772/*
3773 * It's important for performance that XXH3_hashLong is not inlined.
3774 */
3775XXH_NO_INLINE XXH64_hash_t
3776XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
3777 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
3778{
3779 (void)seed64;
3780 return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
3781}
3782
3783/*
3784 * It's important for performance that XXH3_hashLong is not inlined.
3785 * Since the function is not inlined, the compiler may not be able to understand that,
3786 * in some scenarios, its `secret` argument is actually a compile time constant.
3787 * This variant enforces that the compiler can detect that,
3788 * and uses this opportunity to streamline the generated code for better performance.
3789 */
3790XXH_NO_INLINE XXH64_hash_t
3791XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
3792 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
3793{
3794 (void)seed64; (void)secret; (void)secretLen;
3795 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
3796}
3797
3798/*
3799 * XXH3_hashLong_64b_withSeed():
3800 * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
3801 * and then use this key for long mode hashing.
3802 *
3803 * This operation is decently fast but nonetheless costs a little bit of time.
3804 * Try to avoid it whenever possible (typically when seed==0).
3805 *
3806 * It's important for performance that XXH3_hashLong is not inlined. Not sure
3807 * why (uop cache maybe?), but the difference is large and easily measurable.
3808 */
3809XXH_FORCE_INLINE XXH64_hash_t
3810XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
3811 XXH64_hash_t seed,
3812 XXH3_f_accumulate_512 f_acc512,
3813 XXH3_f_scrambleAcc f_scramble,
3814 XXH3_f_initCustomSecret f_initSec)
3815{
3816 if (seed == 0)
3817 return XXH3_hashLong_64b_internal(input, len,
3818 XXH3_kSecret, sizeof(XXH3_kSecret),
3819 f_acc512, f_scramble);
3820 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
3821 f_initSec(secret, seed);
3822 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
3823 f_acc512, f_scramble);
3824 }
3825}
3826
3827/*
3828 * It's important for performance that XXH3_hashLong is not inlined.
3829 */
3830XXH_NO_INLINE XXH64_hash_t
3831XXH3_hashLong_64b_withSeed(const void* input, size_t len,
3832 XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
3833{
3834 (void)secret; (void)secretLen;
3835 return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
3836 XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
3837}
3838
3839
3840typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
3841 XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
3842
3843XXH_FORCE_INLINE XXH64_hash_t
3844XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
3845 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
3846 XXH3_hashLong64_f f_hashLong)
3847{
3848 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
3849 /*
3850 * If an action is to be taken if `secretLen` condition is not respected,
3851 * it should be done here.
3852 * For now, it's a contract pre-condition.
3853 * Adding a check and a branch here would cost performance at every hash.
3854 * Also, note that function signature doesn't offer room to return an error.
3855 */
3856 if (len <= 16)
3857 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
3858 if (len <= 128)
3859 return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
3860 if (len <= XXH3_MIDSIZE_MAX)
3861 return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
3862 return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
3863}
3864
3865
3866/* === Public entry point === */
3867
3868XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
3869{
3870 return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
3871}
3872
3873XXH_PUBLIC_API XXH64_hash_t
3874XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
3875{
3876 return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
3877}
3878
3879XXH_PUBLIC_API XXH64_hash_t
3880XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
3881{
3882 return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
3883}
3884
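/*
 * Illustrative one-shot usage (a minimal sketch; `data`, `size`, `seed`,
 * `secret` and `secretSize` are caller-provided, and `secretSize` must be
 * at least XXH3_SECRET_SIZE_MIN):
 *
 *     XXH64_hash_t h1 = XXH3_64bits(data, size);
 *     XXH64_hash_t h2 = XXH3_64bits_withSeed(data, size, seed);
 *     XXH64_hash_t h3 = XXH3_64bits_withSecret(data, size, secret, secretSize);
 */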
3885
3886/* === XXH3 streaming === */
3887
3888/*
3889 * Malloc's a pointer that is always aligned to align.
3890 *
3891 * This must be freed with `XXH_alignedFree()`.
3892 *
3893 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
3894 * alignment on 32-bit. This isn't enough for the 32-byte aligned loads of AVX2,
3895 * or, on 32-bit, for the 16-byte aligned loads of SSE2 and NEON.
3896 *
3897 * This underalignment previously caused a rather obvious crash which went
3898 * completely unnoticed due to XXH3_createState() not actually being tested.
3899 * Credit to RedSpah for noticing this bug.
3900 *
3901 * The alignment is done manually: functions like posix_memalign or _mm_malloc
3902 * are avoided because, to maintain portability, we would have to write a
3903 * fallback like this anyway, and besides, testing for the existence of
3904 * library functions without relying on external build tools is impossible.
3905 *
3906 * The method is simple: Overallocate, manually align, and store the offset
3907 * to the original behind the returned pointer.
3908 *
3909 * Align must be a power of 2 and 8 <= align <= 128.
3910 */
3911static void* XXH_alignedMalloc(size_t s, size_t align)
3912{
3913 XXH_ASSERT(align <= 128 && align >= 8); /* range check */
3914 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */
3915 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */
3916 { /* Overallocate to make room for manual realignment and an offset byte */
3917 xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
3918 if (base != NULL) {
3919 /*
3920 * Get the offset needed to align this pointer.
3921 *
3922 * Even if `base` is already aligned, the offset is never zero, so there is
3923 * always at least one byte before the returned pointer to store the offset.
3924 */
3925 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
3926 /* Add the offset for the now-aligned pointer */
3927 xxh_u8* ptr = base + offset;
3928
3929 XXH_ASSERT((size_t)ptr % align == 0);
3930
3931 /* Store the offset immediately before the returned pointer. */
3932 ptr[-1] = (xxh_u8)offset;
3933 return ptr;
3934 }
3935 return NULL;
3936 }
3937}
3938/*
3939 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
3940 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
3941 */
3942static void XXH_alignedFree(void* p)
3943{
3944 if (p != NULL) {
3945 xxh_u8* ptr = (xxh_u8*)p;
3946 /* Get the offset byte we stored in XXH_alignedMalloc(). */
3947 xxh_u8 offset = ptr[-1];
3948 /* Free the original malloc'd pointer */
3949 xxh_u8* base = ptr - offset;
3950 XXH_free(base);
3951 }
3952}
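/*
 * Worked example of the layout above (illustrative values): if XXH_malloc()
 * returns base = 0x1003 and align = 16, then offset = 16 - (0x1003 & 15) = 13,
 * the returned pointer is 0x1010, and ptr[-1] stores 13. XXH_alignedFree()
 * recovers base as 0x1010 - 13 = 0x1003 and frees it.
 */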
3953XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
3954{
3955 XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
3956 if (state==NULL) return NULL;
3957 XXH3_INITSTATE(state);
3958 return state;
3959}
3960
3961XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
3962{
3963 XXH_alignedFree(statePtr);
3964 return XXH_OK;
3965}
3966
3967XXH_PUBLIC_API void
3968XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
3969{
3970 memcpy(dst_state, src_state, sizeof(*dst_state));
3971}
3972
3973static void
3974XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
3975 XXH64_hash_t seed,
3976 const void* secret, size_t secretSize)
3977{
3978 size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
3979 size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
3980 XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
3981 XXH_ASSERT(statePtr != NULL);
3982 /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
3983 memset((char*)statePtr + initStart, 0, initLength);
3984 statePtr->acc[0] = XXH_PRIME32_3;
3985 statePtr->acc[1] = XXH_PRIME64_1;
3986 statePtr->acc[2] = XXH_PRIME64_2;
3987 statePtr->acc[3] = XXH_PRIME64_3;
3988 statePtr->acc[4] = XXH_PRIME64_4;
3989 statePtr->acc[5] = XXH_PRIME32_2;
3990 statePtr->acc[6] = XXH_PRIME64_5;
3991 statePtr->acc[7] = XXH_PRIME32_1;
3992 statePtr->seed = seed;
3993 statePtr->extSecret = (const unsigned char*)secret;
3994 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3995 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
3996 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
3997}
3998
3999XXH_PUBLIC_API XXH_errorcode
4000XXH3_64bits_reset(XXH3_state_t* statePtr)
4001{
4002 if (statePtr == NULL) return XXH_ERROR;
4003 XXH3_64bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4004 return XXH_OK;
4005}
4006
4007XXH_PUBLIC_API XXH_errorcode
4008XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
4009{
4010 if (statePtr == NULL) return XXH_ERROR;
4011 if (secret == NULL) return XXH_ERROR;
4012 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4013 XXH3_64bits_reset_internal(statePtr, 0, secret, secretSize);
4014 return XXH_OK;
4015}
4016
4017XXH_PUBLIC_API XXH_errorcode
4018XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4019{
4020 if (statePtr == NULL) return XXH_ERROR;
4021 if (seed==0) return XXH3_64bits_reset(statePtr);
4022 if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4023 XXH3_64bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4024 return XXH_OK;
4025}
4026
4027/* Note: when XXH3_consumeStripes() is invoked,
4028 * there must be a guarantee that at least one more byte will be consumed from input,
4029 * so that the function can blindly consume all stripes using the "normal" secret segment */
4030XXH_FORCE_INLINE void
4031XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4032 size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
4033 const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
4034 const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
4035 XXH3_f_accumulate_512 f_acc512,
4036 XXH3_f_scrambleAcc f_scramble)
4037{
4038 XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
4039 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
4040 if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
4041 /* need a scrambling operation */
4042 size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
4043 size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
4044 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
4045 f_scramble(acc, secret + secretLimit);
4046 XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
4047 *nbStripesSoFarPtr = nbStripesAfterBlock;
4048 } else {
4049 XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
4050 *nbStripesSoFarPtr += nbStripes;
4051 }
4052}
4053
4054/*
4055 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4056 */
4057XXH_FORCE_INLINE XXH_errorcode
4058XXH3_update(XXH3_state_t* state,
4059 const xxh_u8* input, size_t len,
4060 XXH3_f_accumulate_512 f_acc512,
4061 XXH3_f_scrambleAcc f_scramble)
4062{
4063 if (input==NULL)
4064#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
4065 return XXH_OK;
4066#else
4067 return XXH_ERROR;
4068#endif
4069
4070 { const xxh_u8* const bEnd = input + len;
4071 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4072
4073 state->totalLen += len;
4074
4075 if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
4076 XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4077 state->bufferedSize += (XXH32_hash_t)len;
4078 return XXH_OK;
4079 }
4080 /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4081
4082 #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4083 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
4084
4085 /*
4086 * Internal buffer is partially filled (always, except at the beginning).
4087 * Complete it, then consume it.
4088 */
4089 if (state->bufferedSize) {
4090 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4091 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4092 input += loadSize;
4093 XXH3_consumeStripes(state->acc,
4094 &state->nbStripesSoFar, state->nbStripesPerBlock,
4095 state->buffer, XXH3_INTERNALBUFFER_STRIPES,
4096 secret, state->secretLimit,
4097 f_acc512, f_scramble);
4098 state->bufferedSize = 0;
4099 }
4100 XXH_ASSERT(input < bEnd);
4101
4102 /* Consume input by a multiple of internal buffer size */
4103 if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
4104 const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4105 do {
4106 XXH3_consumeStripes(state->acc,
4107 &state->nbStripesSoFar, state->nbStripesPerBlock,
4108 input, XXH3_INTERNALBUFFER_STRIPES,
4109 secret, state->secretLimit,
4110 f_acc512, f_scramble);
4111 input += XXH3_INTERNALBUFFER_SIZE;
4112 } while (input<limit);
4113 /* for last partial stripe */
4114 memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
4115 }
4116 XXH_ASSERT(input < bEnd);
4117
4118 /* Some remaining input (always) : buffer it */
4119 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
4120 state->bufferedSize = (XXH32_hash_t)(bEnd-input);
4121 }
4122
4123 return XXH_OK;
4124}
4125
4126XXH_PUBLIC_API XXH_errorcode
4127XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
4128{
4129 return XXH3_update(state, (const xxh_u8*)input, len,
4130 XXH3_accumulate_512, XXH3_scrambleAcc);
4131}
4132
4133
4134XXH_FORCE_INLINE void
4135XXH3_digest_long (XXH64_hash_t* acc,
4136 const XXH3_state_t* state,
4137 const unsigned char* secret)
4138{
4139 /*
4140 * Digest on a local copy. This way, the state remains unaltered, and it can
4141 * continue ingesting more input afterwards.
4142 */
4143 memcpy(acc, state->acc, sizeof(state->acc));
4144 if (state->bufferedSize >= XXH_STRIPE_LEN) {
4145 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4146 size_t nbStripesSoFar = state->nbStripesSoFar;
4147 XXH3_consumeStripes(acc,
4148 &nbStripesSoFar, state->nbStripesPerBlock,
4149 state->buffer, nbStripes,
4150 secret, state->secretLimit,
4151 XXH3_accumulate_512, XXH3_scrambleAcc);
4152 /* last stripe */
4153 XXH3_accumulate_512(acc,
4154 state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
4155 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4156 } else { /* bufferedSize < XXH_STRIPE_LEN */
4157 xxh_u8 lastStripe[XXH_STRIPE_LEN];
4158 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4159 XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
4160 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
4161 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4162 XXH3_accumulate_512(acc,
4163 lastStripe,
4164 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4165 }
4166}
4167
4168XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
4169{
4170 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4171 if (state->totalLen > XXH3_MIDSIZE_MAX) {
4172 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
4173 XXH3_digest_long(acc, state, secret);
4174 return XXH3_mergeAccs(acc,
4175 secret + XXH_SECRET_MERGEACCS_START,
4176 (xxh_u64)state->totalLen * XXH_PRIME64_1);
4177 }
4178 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4179 if (state->seed)
4180 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4181 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4182 secret, state->secretLimit + XXH_STRIPE_LEN);
4183}
4184
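/*
 * Illustrative streaming usage (a minimal sketch; `chunk`, `chunkSize`, `seed`,
 * `hash` and `next_chunk()` are hypothetical caller-provided names, and error
 * checks are omitted):
 *
 *     XXH3_state_t* const state = XXH3_createState();
 *     XXH3_64bits_reset_withSeed(state, seed);
 *     while ((chunkSize = next_chunk(chunk, sizeof(chunk))) != 0) {
 *         XXH3_64bits_update(state, chunk, chunkSize);
 *     }
 *     hash = XXH3_64bits_digest(state);   // the state can keep ingesting afterwards
 *     XXH3_freeState(state);
 */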
4185
4186#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4187
4188XXH_PUBLIC_API void
4189XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
4190{
4191 XXH_ASSERT(secretBuffer != NULL);
4192 if (customSeedSize == 0) {
4193 memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4194 return;
4195 }
4196 XXH_ASSERT(customSeed != NULL);
4197
4198 { size_t const segmentSize = sizeof(XXH128_hash_t);
4199 size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4200 XXH128_canonical_t scrambler;
4201 XXH64_hash_t seeds[12];
4202 size_t segnb;
4203 XXH_ASSERT(nbSegments == 12);
4204 XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
4205 XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4206
4207 /*
4208 * Copy customSeed to seeds[], truncating or repeating as necessary.
4209 */
4210 { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4211 size_t filled = toFill;
4212 memcpy(seeds, customSeed, toFill);
4213 while (filled < sizeof(seeds)) {
4214 toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4215 memcpy((char*)seeds + filled, seeds, toFill);
4216 filled += toFill;
4217 } }
4218
4219 /* generate secret */
4220 memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4221 for (segnb=1; segnb < nbSegments; segnb++) {
4222 size_t const segmentStart = segnb * segmentSize;
4223 XXH128_canonical_t segment;
4224 XXH128_canonicalFromHash(&segment,
4225 XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
4226 memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
4227 } }
4228}
4229
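/*
 * Illustrative usage (a minimal sketch; `seedMaterial`, `seedSize`, `input`
 * and `len` are caller-provided):
 *
 *     xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
 *     XXH3_generateSecret(secret, seedMaterial, seedSize);
 *     XXH64_hash_t const h = XXH3_64bits_withSecret(input, len, secret, sizeof(secret));
 */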
4230
4231/* ==========================================
4232 * XXH3 128 bits (a.k.a XXH128)
4233 * ==========================================
4234 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
4235 * even without counting the significantly larger output size.
4236 *
4237 * For example, extra steps are taken to avoid the seed-dependent collisions
4238 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
4239 *
4240 * This strength naturally comes at the cost of some speed, especially on short
4241 * lengths. Note that longer hashes are about as fast as the 64-bit version
4242 * due to it using only a slight modification of the 64-bit loop.
4243 *
4244 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
4245 * fast for a _128-bit_ hash on 32-bit (it usually outperforms XXH64).
4246 */
4247
4248XXH_FORCE_INLINE XXH128_hash_t
4249XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4250{
4251 /* A doubled version of 1to3_64b with different constants. */
4252 XXH_ASSERT(input != NULL);
4253 XXH_ASSERT(1 <= len && len <= 3);
4254 XXH_ASSERT(secret != NULL);
4255 /*
4256 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
4257 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
4258 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
4259 */
4260 { xxh_u8 const c1 = input[0];
4261 xxh_u8 const c2 = input[len >> 1];
4262 xxh_u8 const c3 = input[len - 1];
4263 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
4264 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
4265 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
4266 xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
4267 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
4268 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
4269 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
4270 XXH128_hash_t h128;
4271 h128.low64 = XXH64_avalanche(keyed_lo);
4272 h128.high64 = XXH64_avalanche(keyed_hi);
4273 return h128;
4274 }
4275}
4276
4277XXH_FORCE_INLINE XXH128_hash_t
4278XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4279{
4280 XXH_ASSERT(input != NULL);
4281 XXH_ASSERT(secret != NULL);
4282 XXH_ASSERT(4 <= len && len <= 8);
4283 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
4284 { xxh_u32 const input_lo = XXH_readLE32(input);
4285 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
4286 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
4287 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
4288 xxh_u64 const keyed = input_64 ^ bitflip;
4289
4290 /* Shift len to the left so that the multiplier XXH_PRIME64_1 + (len << 2) remains odd; this avoids an even multiplier. */
4291 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
4292
4293 m128.high64 += (m128.low64 << 1);
4294 m128.low64 ^= (m128.high64 >> 3);
4295
4296 m128.low64 = XXH_xorshift64(m128.low64, 35);
4297 m128.low64 *= 0x9FB21C651E98DF25ULL;
4298 m128.low64 = XXH_xorshift64(m128.low64, 28);
4299 m128.high64 = XXH3_avalanche(m128.high64);
4300 return m128;
4301 }
4302}
4303
4304XXH_FORCE_INLINE XXH128_hash_t
4305XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4306{
4307 XXH_ASSERT(input != NULL);
4308 XXH_ASSERT(secret != NULL);
4309 XXH_ASSERT(9 <= len && len <= 16);
4310 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
4311 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
4312 xxh_u64 const input_lo = XXH_readLE64(input);
4313 xxh_u64 input_hi = XXH_readLE64(input + len - 8);
4314 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
4315 /*
4316 * Put len in the middle of m128 to ensure that the length gets mixed to
4317 * both the low and high bits in the 128x64 multiply below.
4318 */
4319 m128.low64 += (xxh_u64)(len - 1) << 54;
4320 input_hi ^= bitfliph;
4321 /*
4322 * Add the high 32 bits of input_hi to the high 32 bits of m128, then
4323 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
4324 * the high 64 bits of m128.
4325 *
4326 * The best approach to this operation is different on 32-bit and 64-bit.
4327 */
4328 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
4329 /*
4330 * 32-bit optimized version, which is more readable.
4331 *
4332 * On 32-bit, it removes an ADC and delays a dependency between the two
4333 * halves of m128.high64, but it generates an extra mask on 64-bit.
4334 */
4335 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
4336 } else {
4337 /*
4338 * 64-bit optimized (albeit more confusing) version.
4339 *
4340 * Uses some properties of addition and multiplication to remove the mask:
4341 *
4342 * Let:
4343 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
4344 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
4345 * c = XXH_PRIME32_2
4346 *
4347 * a + (b * c)
4348 * Inverse Property: x + y - x == y
4349 * a + (b * (1 + c - 1))
4350 * Distributive Property: x * (y + z) == (x * y) + (x * z)
4351 * a + (b * 1) + (b * (c - 1))
4352 * Identity Property: x * 1 == x
4353 * a + b + (b * (c - 1))
4354 *
4355 * Substitute a, b, and c:
4356 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
4357 *
4358 * Since input_hi.hi + input_hi.lo == input_hi, we get this:
4359 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
4360 */
4361 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
4362 }
4363 /* m128 ^= XXH_swap64(m128 >> 64); */
4364 m128.low64 ^= XXH_swap64(m128.high64);
4365
4366 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
4367 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
4368 h128.high64 += m128.high64 * XXH_PRIME64_2;
4369
4370 h128.low64 = XXH3_avalanche(h128.low64);
4371 h128.high64 = XXH3_avalanche(h128.high64);
4372 return h128;
4373 } }
4374}
4375
4376/*
4377 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
4378 */
4379XXH_FORCE_INLINE XXH128_hash_t
4380XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4381{
4382 XXH_ASSERT(len <= 16);
4383 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
4384 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
4385 if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
4386 { XXH128_hash_t h128;
4387 xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
4388 xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
4389 h128.low64 = XXH64_avalanche(seed ^ bitflipl);
4390        h128.high64 = XXH64_avalanche(seed ^ bitfliph);
4391 return h128;
4392 } }
4393}
4394
4395/*
4396 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
4397 */
4398XXH_FORCE_INLINE XXH128_hash_t
4399XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
4400 const xxh_u8* secret, XXH64_hash_t seed)
4401{
4402 acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
4403 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
4404 acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
4405 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
4406 return acc;
4407}
4408
4409
4410XXH_FORCE_INLINE XXH128_hash_t
4411XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4412 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4413 XXH64_hash_t seed)
4414{
4415 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
4416 XXH_ASSERT(16 < len && len <= 128);
4417
4418 { XXH128_hash_t acc;
4419 acc.low64 = len * XXH_PRIME64_1;
4420 acc.high64 = 0;
4421 if (len > 32) {
4422 if (len > 64) {
4423 if (len > 96) {
4424 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
4425 }
4426 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
4427 }
4428 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
4429 }
4430 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
4431 { XXH128_hash_t h128;
4432 h128.low64 = acc.low64 + acc.high64;
4433 h128.high64 = (acc.low64 * XXH_PRIME64_1)
4434 + (acc.high64 * XXH_PRIME64_4)
4435 + ((len - seed) * XXH_PRIME64_2);
4436 h128.low64 = XXH3_avalanche(h128.low64);
4437 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
4438 return h128;
4439 }
4440 }
4441}
4442
4443XXH_NO_INLINE XXH128_hash_t
4444XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4445 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4446 XXH64_hash_t seed)
4447{
4448 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
4449 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
4450
4451 { XXH128_hash_t acc;
4452 int const nbRounds = (int)len / 32;
4453 int i;
4454 acc.low64 = len * XXH_PRIME64_1;
4455 acc.high64 = 0;
4456 for (i=0; i<4; i++) {
4457 acc = XXH128_mix32B(acc,
4458 input + (32 * i),
4459 input + (32 * i) + 16,
4460 secret + (32 * i),
4461 seed);
4462 }
4463 acc.low64 = XXH3_avalanche(acc.low64);
4464 acc.high64 = XXH3_avalanche(acc.high64);
4465 XXH_ASSERT(nbRounds >= 4);
4466 for (i=4 ; i < nbRounds; i++) {
4467 acc = XXH128_mix32B(acc,
4468 input + (32 * i),
4469 input + (32 * i) + 16,
4470 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
4471 seed);
4472 }
4473 /* last bytes */
4474 acc = XXH128_mix32B(acc,
4475 input + len - 16,
4476 input + len - 32,
4477 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
4478 0ULL - seed);
4479
4480 { XXH128_hash_t h128;
4481 h128.low64 = acc.low64 + acc.high64;
4482 h128.high64 = (acc.low64 * XXH_PRIME64_1)
4483 + (acc.high64 * XXH_PRIME64_4)
4484 + ((len - seed) * XXH_PRIME64_2);
4485 h128.low64 = XXH3_avalanche(h128.low64);
4486 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
4487 return h128;
4488 }
4489 }
4490}
4491
4492XXH_FORCE_INLINE XXH128_hash_t
4493XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
4494 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4495 XXH3_f_accumulate_512 f_acc512,
4496 XXH3_f_scrambleAcc f_scramble)
4497{
4498 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4499
4500 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
4501
4502 /* converge into final hash */
4503 XXH_STATIC_ASSERT(sizeof(acc) == 64);
4504 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
4505 { XXH128_hash_t h128;
4506 h128.low64 = XXH3_mergeAccs(acc,
4507 secret + XXH_SECRET_MERGEACCS_START,
4508 (xxh_u64)len * XXH_PRIME64_1);
4509 h128.high64 = XXH3_mergeAccs(acc,
4510 secret + secretSize
4511 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
4512 ~((xxh_u64)len * XXH_PRIME64_2));
4513 return h128;
4514 }
4515}
4516
4517/*
4518 * It's important for performance that XXH3_hashLong is not inlined.
4519 */
4520XXH_NO_INLINE XXH128_hash_t
4521XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
4522 XXH64_hash_t seed64,
4523 const void* XXH_RESTRICT secret, size_t secretLen)
4524{
4525 (void)seed64; (void)secret; (void)secretLen;
4526 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
4527 XXH3_accumulate_512, XXH3_scrambleAcc);
4528}
4529
4530/*
4531 * It's important for performance that XXH3_hashLong is not inlined.
4532 */
4533XXH_NO_INLINE XXH128_hash_t
4534XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
4535 XXH64_hash_t seed64,
4536 const void* XXH_RESTRICT secret, size_t secretLen)
4537{
4538 (void)seed64;
4539 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
4540 XXH3_accumulate_512, XXH3_scrambleAcc);
4541}
4542
4543XXH_FORCE_INLINE XXH128_hash_t
4544XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
4545 XXH64_hash_t seed64,
4546 XXH3_f_accumulate_512 f_acc512,
4547 XXH3_f_scrambleAcc f_scramble,
4548 XXH3_f_initCustomSecret f_initSec)
4549{
4550 if (seed64 == 0)
4551 return XXH3_hashLong_128b_internal(input, len,
4552 XXH3_kSecret, sizeof(XXH3_kSecret),
4553 f_acc512, f_scramble);
4554 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4555 f_initSec(secret, seed64);
4556 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
4557 f_acc512, f_scramble);
4558 }
4559}
4560
4561/*
4562 * It's important for performance that XXH3_hashLong is not inlined.
4563 */
4564XXH_NO_INLINE XXH128_hash_t
4565XXH3_hashLong_128b_withSeed(const void* input, size_t len,
4566 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
4567{
4568 (void)secret; (void)secretLen;
4569 return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
4570 XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
4571}
4572
4573typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
4574 XXH64_hash_t, const void* XXH_RESTRICT, size_t);
4575
4576XXH_FORCE_INLINE XXH128_hash_t
4577XXH3_128bits_internal(const void* input, size_t len,
4578 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
4579 XXH3_hashLong128_f f_hl128)
4580{
4581 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
4582 /*
4583     * If any action needed to be taken when the `secret` conditions are not respected,
4584     * it should be done here.
4585 * For now, it's a contract pre-condition.
4586 * Adding a check and a branch here would cost performance at every hash.
4587 */
4588 if (len <= 16)
4589 return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
4590 if (len <= 128)
4591 return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
4592 if (len <= XXH3_MIDSIZE_MAX)
4593 return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
4594 return f_hl128(input, len, seed64, secret, secretLen);
4595}
4596
4597
4598/* === Public XXH128 API === */
4599
4600XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
4601{
4602 return XXH3_128bits_internal(input, len, 0,
4603 XXH3_kSecret, sizeof(XXH3_kSecret),
4604 XXH3_hashLong_128b_default);
4605}
4606
4607XXH_PUBLIC_API XXH128_hash_t
4608XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
4609{
4610 return XXH3_128bits_internal(input, len, 0,
4611 (const xxh_u8*)secret, secretSize,
4612 XXH3_hashLong_128b_withSecret);
4613}
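
/*
 * Illustrative custom-secret usage (a minimal sketch, not part of the library;
 * myData, myDataSize and mySecret are hypothetical). The secret must provide
 * at least XXH3_SECRET_SIZE_MIN bytes of high-entropy content: as noted in
 * XXH3_128bits_internal() above, this is a contract pre-condition and is not
 * re-checked at runtime on this path.
 *
 *     unsigned char mySecret[XXH3_SECRET_SIZE_MIN];
 *     // ... fill mySecret with unpredictable, well-distributed bytes ...
 *     XXH128_hash_t const h = XXH3_128bits_withSecret(myData, myDataSize,
 *                                                     mySecret, sizeof(mySecret));
 */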
4614
4615XXH_PUBLIC_API XXH128_hash_t
4616XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
4617{
4618 return XXH3_128bits_internal(input, len, seed,
4619 XXH3_kSecret, sizeof(XXH3_kSecret),
4620 XXH3_hashLong_128b_withSeed);
4621}
4622
4623XXH_PUBLIC_API XXH128_hash_t
4624XXH128(const void* input, size_t len, XXH64_hash_t seed)
4625{
4626 return XXH3_128bits_withSeed(input, len, seed);
4627}
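
/*
 * Illustrative one-shot usage (a minimal sketch, not part of the library;
 * myData and myDataSize are hypothetical):
 *
 *     XXH128_hash_t const h = XXH128(myData, myDataSize, 0);
 *     // h.low64 and h.high64 together form the 128-bit result.
 *     // With a 0 seed, this matches XXH3_128bits(myData, myDataSize).
 */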
4628
4629
4630/* === XXH3 128-bit streaming === */
4631
4632/*
4633 * All the functions are actually the same as for the 64-bit streaming variant.
4634 * The only difference is the finalization routine.
4635 */
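
/*
 * Illustrative streaming usage (a minimal sketch, not part of the library;
 * the input chunk variables are hypothetical):
 *
 *     XXH3_state_t* const state = XXH3_createState();
 *     if (state == NULL) return XXH_ERROR;          // allocation failure
 *     XXH3_128bits_reset(state);                    // or XXH3_128bits_reset_withSeed(state, seed)
 *     while (more_input) {
 *         XXH3_128bits_update(state, chunkPtr, chunkSize);
 *     }
 *     {   XXH128_hash_t const h = XXH3_128bits_digest(state);
 *         // ... use h ...
 *     }
 *     XXH3_freeState(state);
 */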
4636
4637static void
4638XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
4639 XXH64_hash_t seed,
4640 const void* secret, size_t secretSize)
4641{
4642 XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
4643}
4644
4645XXH_PUBLIC_API XXH_errorcode
4646XXH3_128bits_reset(XXH3_state_t* statePtr)
4647{
4648 if (statePtr == NULL) return XXH_ERROR;
4649 XXH3_128bits_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4650 return XXH_OK;
4651}
4652
4653XXH_PUBLIC_API XXH_errorcode
4654XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
4655{
4656 if (statePtr == NULL) return XXH_ERROR;
4657 XXH3_128bits_reset_internal(statePtr, 0, secret, secretSize);
4658 if (secret == NULL) return XXH_ERROR;
4659 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4660 return XXH_OK;
4661}
4662
4663XXH_PUBLIC_API XXH_errorcode
4664XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4665{
4666 if (statePtr == NULL) return XXH_ERROR;
4667 if (seed==0) return XXH3_128bits_reset(statePtr);
4668 if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4669 XXH3_128bits_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4670 return XXH_OK;
4671}
4672
4673XXH_PUBLIC_API XXH_errorcode
4674XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
4675{
4676 return XXH3_update(state, (const xxh_u8*)input, len,
4677 XXH3_accumulate_512, XXH3_scrambleAcc);
4678}
4679
4680XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
4681{
4682 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4683 if (state->totalLen > XXH3_MIDSIZE_MAX) {
4684 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
4685 XXH3_digest_long(acc, state, secret);
4686 XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
4687 { XXH128_hash_t h128;
4688 h128.low64 = XXH3_mergeAccs(acc,
4689 secret + XXH_SECRET_MERGEACCS_START,
4690 (xxh_u64)state->totalLen * XXH_PRIME64_1);
4691 h128.high64 = XXH3_mergeAccs(acc,
4692 secret + state->secretLimit + XXH_STRIPE_LEN
4693 - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
4694 ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
4695 return h128;
4696 }
4697 }
4698 /* len <= XXH3_MIDSIZE_MAX : short code */
4699 if (state->seed)
4700 return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4701 return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
4702 secret, state->secretLimit + XXH_STRIPE_LEN);
4703}
4704
4705/* 128-bit utility functions */
4706
4707#include <string.h> /* memcmp, memcpy */
4708
4709/* return : 1 if equal, 0 if different */
4710XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
4711{
4712 /* note : XXH128_hash_t is compact, it has no padding byte */
4713 return !(memcmp(&h1, &h2, sizeof(h1)));
4714}
4715
4716/* This prototype is compatible with stdlib's qsort().
4717 * return : >0 if *h128_1 > *h128_2
4718 * <0 if *h128_1 < *h128_2
4719 * =0 if *h128_1 == *h128_2 */
4720XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
4721{
4722 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
4723 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
4724 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
4725    /* note : expects that, in most cases, hash values are different */
4726 if (hcmp) return hcmp;
4727 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
4728}
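
/*
 * Illustrative use with stdlib's qsort() (a minimal sketch; NB_HASHES and
 * hashes[] are hypothetical):
 *
 *     #include <stdlib.h>   // qsort
 *
 *     XXH128_hash_t hashes[NB_HASHES];
 *     // ... fill hashes[] ...
 *     qsort(hashes, NB_HASHES, sizeof(hashes[0]), XXH128_cmp);
 *     // hashes[] is now sorted by (high64, low64), the order defined by XXH128_cmp.
 */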
4729
4730
4731/*====== Canonical representation ======*/
4732XXH_PUBLIC_API void
4733XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
4734{
4735 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
4736 if (XXH_CPU_LITTLE_ENDIAN) {
4737 hash.high64 = XXH_swap64(hash.high64);
4738 hash.low64 = XXH_swap64(hash.low64);
4739 }
4740 memcpy(dst, &hash.high64, sizeof(hash.high64));
4741 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
4742}
4743
4744XXH_PUBLIC_API XXH128_hash_t
4745XXH128_hashFromCanonical(const XXH128_canonical_t* src)
4746{
4747 XXH128_hash_t h;
4748 h.high64 = XXH_readBE64(src);
4749 h.low64 = XXH_readBE64(src->digest + 8);
4750 return h;
4751}
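
/*
 * Illustrative canonical round-trip (a minimal sketch; `h` stands for a
 * previously computed XXH128_hash_t):
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);
 *     // canon.digest[] now holds the hash in big-endian byte order, and can be
 *     // stored or exchanged as-is, independently of the host's endianness.
 *
 *     XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(back, h) returns 1 : the round-trip preserves the value.
 */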
4752
4753/* Pop our optimization override from above */
4754#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
4755 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
4756 && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
4757# pragma GCC pop_options
4758#endif
4759
4760#endif /* XXH_NO_LONG_LONG */
4761
4762
4763#endif /* XXH_IMPLEMENTATION */
Willy Tarreaub5684e02015-04-27 11:59:40 +02004764
4765
4766#if defined (__cplusplus)
4767}
4768#endif