// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAYHASH_VECTOR128_H_
#define HIGHWAYHASH_VECTOR128_H_

// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators:
// const V2x64U masked_sum = (a + b) & m;
// This is shorter and more readable than compiler intrinsics:
// const __m128i masked_sum = _mm_and_si128(_mm_add_epi64(a, b), m);
// There is typically no runtime cost for these abstractions.
//
// The naming convention is VNxBBT where N is the number of lanes, BB the
// number of bits per lane and T is the lane type: unsigned integer (U),
// signed integer (I), or floating-point (F).

// WARNING: this is a "restricted" header because it is included from
// translation units compiled with different flags. This header and its
// dependencies must not define any function unless it is static inline and/or
// within namespace HH_TARGET_NAME. See arch_specific.h for details.

#include <stddef.h>
#include <stdint.h>

#include "highwayhash/arch_specific.h"
#include "highwayhash/compiler_specific.h"

// For auto-dependency generation, we need to include all headers but not their
// contents (otherwise compilation fails because -msse4.1 is not specified).
#ifndef HH_DISABLE_TARGET_SPECIFIC

// WARNING: smmintrin.h will also be included through immintrin.h in the AVX2
// translation unit, which is compiled with different flags. This risks ODR
// violations, and can cause crashes when functions are not inlined and the
// linker selects the AVX2 version. Unfortunately this include cannot reside
// within a namespace due to conflicts with other system headers. We need to
// assume all the intrinsic functions (defined as static inline by Clang's
// library and as extern inline by GCC) are in fact inlined. targets.bzl
// generates a test that verifies this by detecting duplicate symbols.
#include <smmintrin.h>  // SSE4.1

namespace highwayhash {
// To prevent ODR violations when including this from multiple translation
// units (TU) that are compiled with different flags, the contents must reside
// in a namespace whose name is unique to the TU. NOTE: this behavior is
// incompatible with precompiled modules and requires textual inclusion
// instead.
namespace HH_TARGET_NAME {

// Primary template for 128-bit SSE4.1 vectors; only specializations are used.
template <typename T>
class V128 {};

template <>
class V128<uint8_t> {
 public:
  using Intrinsic = __m128i;
  using T = uint8_t;
  static constexpr size_t N = 16;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(_mm_set1_epi8(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_epi8(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_epi8(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_epi8(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_si128(v_, other.v_);
    return *this;
  }

 private:
  Intrinsic v_;
};
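
// Illustrative usage sketch, not part of the original header: shows how the
// member operators of V128<uint8_t> defined above are meant to be combined.
// The function name ExampleAccumulateBytes is hypothetical.
HH_INLINE V128<uint8_t> ExampleAccumulateBytes(const V128<uint8_t>& bytes) {
  V128<uint8_t> sum(bytes);    // copy the 16 input lanes
  sum += V128<uint8_t>(1);     // broadcast 1 and add it to every lane
  sum &= V128<uint8_t>(0x0F);  // keep only the low nibble of each lane
  return sum;
}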
template <>
class V128<uint16_t> {
 public:
  using Intrinsic = __m128i;
  using T = uint16_t;
  static constexpr size_t N = 8;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0)
      : v_(_mm_set_epi16(p_7, p_6, p_5, p_4, p_3, p_2, p_1, p_0)) {}

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(_mm_set1_epi16(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_epi16(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_epi16(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_epi16(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_si128(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    v_ = _mm_slli_epi16(v_, count);
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = _mm_sll_epi16(v_, count);
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    v_ = _mm_srli_epi16(v_, count);
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = _mm_srl_epi16(v_, count);
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<uint32_t> {
 public:
  using Intrinsic = __m128i;
  using T = uint32_t;
  static constexpr size_t N = 4;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
      : v_(_mm_set_epi32(p_3, p_2, p_1, p_0)) {}

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(_mm_set1_epi32(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_epi32(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_epi32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_epi32(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_si128(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    v_ = _mm_slli_epi32(v_, count);
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = _mm_sll_epi32(v_, count);
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    v_ = _mm_srli_epi32(v_, count);
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = _mm_srl_epi32(v_, count);
    return *this;
  }

 private:
  Intrinsic v_;
};
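
// Illustrative usage sketch, not part of the original header: demonstrates the
// immediate and vector-count shift operators of V128<uint32_t>. The function
// name ExampleShiftLanes is hypothetical.
HH_INLINE V128<uint32_t> ExampleShiftLanes(const V128<uint32_t>& v) {
  V128<uint32_t> t(v);
  t <<= 4;                     // shift each 32-bit lane left by 4 bits
  t >>= _mm_cvtsi32_si128(2);  // shift right by a count held in a vector
  return t;
}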
template <>
class V128<uint64_t> {
 public:
  using Intrinsic = __m128i;
  using T = uint64_t;
  static constexpr size_t N = 2;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_epi64x(p_1, p_0)) {}

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(_mm_set_epi64x(i, i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_epi64(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_epi64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_epi64(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_si128(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_si128(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    v_ = _mm_slli_epi64(v_, count);
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = _mm_sll_epi64(v_, count);
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    v_ = _mm_srli_epi64(v_, count);
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = _mm_srl_epi64(v_, count);
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<float> {
 public:
  using Intrinsic = __m128;
  using T = float;
  static constexpr size_t N = 4;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_3, T p_2, T p_1, T p_0)
      : v_(_mm_set_ps(p_3, p_2, p_1, p_0)) {}

  // Broadcasts to all lanes.
  HH_INLINE explicit V128(T f) : v_(_mm_set1_ps(f)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_ps(v_, other.v_));
  }
  HH_INLINE V128 operator<(const V128& other) const {
    return V128(_mm_cmplt_ps(v_, other.v_));
  }
  HH_INLINE V128 operator>(const V128& other) const {
    return V128(_mm_cmplt_ps(other.v_, v_));
  }

  HH_INLINE V128& operator*=(const V128& other) {
    v_ = _mm_mul_ps(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator/=(const V128& other) {
    v_ = _mm_div_ps(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_ps(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_ps(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_ps(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_ps(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_ps(v_, other.v_);
    return *this;
  }

 private:
  Intrinsic v_;
};
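
// Illustrative usage sketch, not part of the original header: float
// comparisons return an all-ones mask per lane, which combines with the
// bitwise operators. ExampleZeroNegativeLanes is a hypothetical helper that
// clears every lane that is not greater than zero.
HH_INLINE V128<float> ExampleZeroNegativeLanes(const V128<float>& v) {
  const V128<float> positive(V128<float>(0.0f) < v);  // all-ones where v > 0
  V128<float> result(v);
  result &= positive;  // lanes with a zero mask become 0.0f
  return result;
}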
template <>
class V128<double> {
 public:
  using Intrinsic = __m128d;
  using T = double;
  static constexpr size_t N = 2;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_1, T p_0) : v_(_mm_set_pd(p_1, p_0)) {}

  // Broadcasts to all lanes.
  HH_INLINE explicit V128(T f) : v_(_mm_set1_pd(f)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}
  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  HH_INLINE V128 operator==(const V128& other) const {
    return V128(_mm_cmpeq_pd(v_, other.v_));
  }
  HH_INLINE V128 operator<(const V128& other) const {
    return V128(_mm_cmplt_pd(v_, other.v_));
  }
  HH_INLINE V128 operator>(const V128& other) const {
    return V128(_mm_cmplt_pd(other.v_, v_));
  }

  HH_INLINE V128& operator*=(const V128& other) {
    v_ = _mm_mul_pd(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator/=(const V128& other) {
    v_ = _mm_div_pd(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator+=(const V128& other) {
    v_ = _mm_add_pd(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = _mm_sub_pd(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = _mm_and_pd(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = _mm_or_pd(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = _mm_xor_pd(v_, other.v_);
    return *this;
  }

 private:
  Intrinsic v_;
};

// Nonmember functions for any V128 via member functions.

template <typename T>
HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t *= right;
}

template <typename T>
HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t /= right;
}

template <typename T>
HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t += right;
}

template <typename T>
HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t -= right;
}

template <typename T>
HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t &= right;
}

template <typename T>
HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t |= right;
}

template <typename T>
HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t ^= right;
}

template <typename T>
HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) {
  V128<T> t(v);
  return t <<= count;
}

template <typename T>
HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) {
  V128<T> t(v);
  return t >>= count;
}

template <typename T>
HH_INLINE V128<T> operator<<(const V128<T>& v, const __m128i& count) {
  V128<T> t(v);
  return t <<= count;
}

template <typename T>
HH_INLINE V128<T> operator>>(const V128<T>& v, const __m128i& count) {
  V128<T> t(v);
  return t >>= count;
}

using V16x8U = V128<uint8_t>;
using V8x16U = V128<uint16_t>;
using V4x32U = V128<uint32_t>;
using V2x64U = V128<uint64_t>;
using V4x32F = V128<float>;
using V2x64F = V128<double>;
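
// Illustrative sketch, not part of the original header: the masked-sum example
// from the file-level comment, written with the V2x64U alias and the nonmember
// operators above. ExampleMaskedSum is a hypothetical name.
HH_INLINE V2x64U ExampleMaskedSum(const V2x64U& a, const V2x64U& b,
                                  const V2x64U& m) {
  // Equivalent to _mm_and_si128(_mm_add_epi64(a, b), m).
  return (a + b) & m;
}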
// Load/Store for any V128.
//
// We differentiate between targets' vector types via template specialization.
// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
// generate better code in unoptimized builds. Only declare the primary
// templates to avoid needing mutual exclusion with vector256.
template <class V>
HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);

template <class V>
HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);

// "from" must be vector-aligned.
template <>
HH_INLINE V16x8U Load<V16x8U>(const V16x8U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V16x8U(_mm_load_si128(p));
}
template <>
HH_INLINE V8x16U Load<V8x16U>(const V8x16U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V8x16U(_mm_load_si128(p));
}
template <>
HH_INLINE V4x32U Load<V4x32U>(const V4x32U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V4x32U(_mm_load_si128(p));
}
template <>
HH_INLINE V2x64U Load<V2x64U>(const V2x64U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V2x64U(_mm_load_si128(p));
}
template <>
HH_INLINE V4x32F Load<V4x32F>(const V4x32F::T* const HH_RESTRICT from) {
  return V4x32F(_mm_load_ps(from));
}
template <>
HH_INLINE V2x64F Load<V2x64F>(const V2x64F::T* const HH_RESTRICT from) {
  return V2x64F(_mm_load_pd(from));
}

template <>
HH_INLINE V16x8U LoadUnaligned<V16x8U>(
    const V16x8U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V16x8U(_mm_loadu_si128(p));
}
template <>
HH_INLINE V8x16U LoadUnaligned<V8x16U>(
    const V8x16U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V8x16U(_mm_loadu_si128(p));
}
template <>
HH_INLINE V4x32U LoadUnaligned<V4x32U>(
    const V4x32U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V4x32U(_mm_loadu_si128(p));
}
template <>
HH_INLINE V2x64U LoadUnaligned<V2x64U>(
    const V2x64U::T* const HH_RESTRICT from) {
  const __m128i* const HH_RESTRICT p =
      reinterpret_cast<const __m128i * HH_RESTRICT>(from);
  return V2x64U(_mm_loadu_si128(p));
}
template <>
HH_INLINE V4x32F LoadUnaligned<V4x32F>(
    const V4x32F::T* const HH_RESTRICT from) {
  return V4x32F(_mm_loadu_ps(from));
}
template <>
HH_INLINE V2x64F LoadUnaligned<V2x64F>(
    const V2x64F::T* const HH_RESTRICT from) {
  return V2x64F(_mm_loadu_pd(from));
}

// "to" must be vector-aligned.
template <typename T>
HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to) {
  _mm_store_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
}
HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) {
  _mm_store_ps(to, v);
}
HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) {
  _mm_store_pd(to, v);
}

template <typename T>
HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to) {
  _mm_storeu_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
}
HH_INLINE void StoreUnaligned(const V128<float>& v,
                              float* const HH_RESTRICT to) {
  _mm_storeu_ps(to, v);
}
HH_INLINE void StoreUnaligned(const V128<double>& v,
                              double* const HH_RESTRICT to) {
  _mm_storeu_pd(to, v);
}

// Writes directly to (aligned) memory, bypassing the cache. This is useful for
// data that will not be read again in the near future.
template <typename T>
HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) {
  _mm_stream_si128(reinterpret_cast<__m128i * HH_RESTRICT>(to), v);
}
HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) {
  _mm_stream_ps(to, v);
}
HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) {
  _mm_stream_pd(to, v);
}
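
// Illustrative sketch, not part of the original header: a load/modify/store
// round trip over one vector of four uint32_t values. The function name
// ExampleAddOne is hypothetical; "from" and "to" must be 16-byte aligned
// because the aligned Load/Store variants are used.
HH_INLINE void ExampleAddOne(const uint32_t* HH_RESTRICT from,
                             uint32_t* HH_RESTRICT to) {
  const V4x32U incremented(Load<V4x32U>(from) + V4x32U(1));  // +1 per lane
  Store(incremented, to);
}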
// Miscellaneous functions.

template <typename T>
HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) {
  constexpr size_t num_bits = sizeof(T) * 8;
  return (v << count) | (v >> (num_bits - count));
}

template <typename T>
HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) {
  return V128<T>(_mm_andnot_si128(neg_mask, values));
}
template <>
HH_INLINE V128<float> AndNot(const V128<float>& neg_mask,
                             const V128<float>& values) {
  return V128<float>(_mm_andnot_ps(neg_mask, values));
}
template <>
HH_INLINE V128<double> AndNot(const V128<double>& neg_mask,
                              const V128<double>& values) {
  return V128<double>(_mm_andnot_pd(neg_mask, values));
}

HH_INLINE V4x32F Select(const V4x32F& a, const V4x32F& b, const V4x32F& mask) {
  return V4x32F(_mm_blendv_ps(a, b, mask));
}

HH_INLINE V2x64F Select(const V2x64F& a, const V2x64F& b, const V2x64F& mask) {
  return V2x64F(_mm_blendv_pd(a, b, mask));
}

// Min/Max

HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) {
  return V16x8U(_mm_min_epu8(v0, v1));
}

HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) {
  return V16x8U(_mm_max_epu8(v0, v1));
}

HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) {
  return V8x16U(_mm_min_epu16(v0, v1));
}

HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) {
  return V8x16U(_mm_max_epu16(v0, v1));
}

HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) {
  return V4x32U(_mm_min_epu32(v0, v1));
}

HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) {
  return V4x32U(_mm_max_epu32(v0, v1));
}

HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) {
  return V4x32F(_mm_min_ps(v0, v1));
}

HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) {
  return V4x32F(_mm_max_ps(v0, v1));
}

HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) {
  return V2x64F(_mm_min_pd(v0, v1));
}

HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) {
  return V2x64F(_mm_max_pd(v0, v1));
}
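
// Illustrative sketch, not part of the original header: combines RotateLeft,
// the cross-type constructor and Min. ExampleMixLanes is a hypothetical name,
// not an actual HighwayHash step.
HH_INLINE V2x64U ExampleMixLanes(const V2x64U& a, const V2x64U& b) {
  const V2x64U rotated(RotateLeft(a, 32));                // per-lane rotate
  const V4x32U smaller(Min(V4x32U(rotated), V4x32U(b)));  // 32-bit unsigned min
  return V2x64U(smaller);  // reinterpret the result as two 64-bit lanes
}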
}  // namespace HH_TARGET_NAME
}  // namespace highwayhash

#endif  // HH_DISABLE_TARGET_SPECIFIC
#endif  // HIGHWAYHASH_VECTOR128_H_