// Copyright 2016-2019 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAYHASH_VECTOR128_NEON_H_
#define HIGHWAYHASH_VECTOR128_NEON_H_

// Defines SIMD vector classes ("V2x64U") with overloaded arithmetic operators:
// const V2x64U masked_sum = (a + b) & m;
// This is shorter and more readable than compiler intrinsics:
// const uint64x2_t masked_sum = vandq_u64(vaddq_u64(a, b), m);
// There is typically no runtime cost for these abstractions.
//
// The naming convention is VNxBBT where N is the number of lanes, BB the
// number of bits per lane and T is the lane type: unsigned integer (U),
// signed integer (I), or floating-point (F). (Note: floating-point vectors
// are currently disabled.)

// WARNING: this is a "restricted" header because it is included from
// translation units compiled with different flags. This header and its
// dependencies must not define any function unless it is static inline and/or
// within namespace HH_TARGET_NAME. See arch_specific.h for details.

#include "highwayhash/arch_specific.h"
#include "highwayhash/compiler_specific.h"
#include "highwayhash/hh_types.h"

// For auto-dependency generation, we need to include all headers but not their
// contents.
#ifndef HH_DISABLE_TARGET_SPECIFIC
#include <arm_neon.h>  // NEON

namespace highwayhash {
// To prevent ODR violations when including this from multiple translation
// units (TU) that are compiled with different flags, the contents must reside
// in a namespace whose name is unique to the TU. NOTE: this behavior is
// incompatible with precompiled modules and requires textual inclusion
// instead.
namespace HH_TARGET_NAME {

// Polyfills for ARMv7. ARMv7 lacks a few important instructions that were
// added in AArch64, so we simulate them with these polyfills.
#if !defined(__aarch64__) && !defined(__arm64__)

#ifndef vqtbl1q_u8
// AArch64 allows a 128-bit lookup table with a single vqtbl1q_u8. ARMv7 needs
// two 64-bit vtbl2_u8 lookups for the same effect.
HH_INLINE uint8x16_t vqtbl1q_u8(uint8x16_t t, uint8x16_t idx) {
  // The reinterpret_cast (rather than vget_low/vget_high) prevents GCC from
  // scalarizing. NOT PORTABLE TO AARCH64: there, uint8x8x2_t is two separate
  // vectors instead of a packed pair, so this cast would be invalid.
  uint8x8x2_t split = *reinterpret_cast<uint8x8x2_t*>(&t);
  return vcombine_u8(vtbl2_u8(split, vget_low_u8(idx)),
                     vtbl2_u8(split, vget_high_u8(idx)));
}
#endif

#ifndef vnegq_s64
// ARMv7 lacks a 64-bit vector negate.
HH_INLINE int64x2_t vnegq_s64(int64x2_t v) {
  const int64x2_t zero = vdupq_n_s64(0);
  return vsubq_s64(zero, v);
}
#endif

#ifndef vceqq_u64
// ARMv7 lacks a 64-bit compare; a 64-bit lane is equal iff both of its 32-bit
// halves compare equal.
HH_INLINE uint64x2_t vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
  uint32x4_t comparison =
      vceqq_u32(vreinterpretq_u32_u64(v1), vreinterpretq_u32_u64(v2));
  return vreinterpretq_u64_u32(vandq_u32(comparison, vrev64q_u32(comparison)));
}
#endif  // vceqq_u64

#endif  // !__aarch64__ && !__arm64__

// Pseudo-instructions.
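
// The helpers below mimic the SSE2 intrinsics named in their comments.
// Illustrative usage (a sketch with hypothetical locals, not part of the API):
//   uint64_t lane0;
//   vst1q_low_u64(&lane0, v);              // like _mm_storel_epi64
//   uint64x2_t w = vld1q_low_u64(&lane0);  // like _mm_loadl_epi64; lane 1 = 0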
// _mm_storel_epi64
HH_INLINE void vst1q_low_u64(uint64_t* a, uint64x2_t b) {
  uint64x1_t lo = vget_low_u64(b);
  vst1_u8(reinterpret_cast<uint8_t*>(a), vreinterpret_u8_u64(lo));
}

// _mm_loadl_epi64
HH_INLINE uint64x2_t vld1q_low_u64(const uint64_t* p) {
  return vcombine_u64(vld1_u64(p), vdup_n_u64(0));
}

// _mm_slli_si128 (almost)
#define vshlq_n_u128(a, imm) \
  (vreinterpretq_u64_u8(     \
      vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(a), 16 - (imm))))

// Adapted from xsimd.
// arm_neon.h requires compile-time literals for the count parameter of many
// intrinsics such as vshrq_n_u64, and complains even when the value is known
// at compile time. These macros expand a runtime count into a switch whose
// cases call the intrinsic with every legal literal count.
#define EXPAND(...) __VA_ARGS__
#define CASE(op, i, ...)     \
  case i:                    \
    v_ = op(__VA_ARGS__, i); \
    break;

#define INTRINSIC_REPEAT_8_0(op, addx, ...) \
  CASE(EXPAND(op), 1 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 2 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 3 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 4 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 5 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 6 + addx, __VA_ARGS__);  \
  CASE(EXPAND(op), 7 + addx, __VA_ARGS__);

#define INTRINSIC_REPEAT_8_N(op, addx, ...) \
  CASE(EXPAND(op), 0 + addx, __VA_ARGS__);  \
  INTRINSIC_REPEAT_8_0(op, addx, __VA_ARGS__);

#define INTRINSIC_REPEAT_8(op, ...) INTRINSIC_REPEAT_8_0(op, 0, __VA_ARGS__);

#define INTRINSIC_REPEAT_16_0(op, addx, ...)       \
  INTRINSIC_REPEAT_8_0(op, 0 + addx, __VA_ARGS__); \
  INTRINSIC_REPEAT_8_N(op, 8 + addx, __VA_ARGS__);

#define INTRINSIC_REPEAT_16_N(op, addx, ...)       \
  INTRINSIC_REPEAT_8_N(op, 0 + addx, __VA_ARGS__); \
  INTRINSIC_REPEAT_8_N(op, 8 + addx, __VA_ARGS__);

#define INTRINSIC_EACH_16(op, ...) INTRINSIC_REPEAT_16_0(op, 0, __VA_ARGS__);

#define INTRINSIC_REPEAT_32_0(op, addx, ...)        \
  INTRINSIC_REPEAT_16_0(op, 0 + addx, __VA_ARGS__); \
  INTRINSIC_REPEAT_16_N(op, 16 + addx, __VA_ARGS__);

#define INTRINSIC_REPEAT_32_N(op, addx, ...)        \
  INTRINSIC_REPEAT_16_N(op, 0 + addx, __VA_ARGS__); \
  INTRINSIC_REPEAT_16_N(op, 16 + addx, __VA_ARGS__);

#define INTRINSIC_EACH_32(op, ...) INTRINSIC_REPEAT_32_0(op, 0, __VA_ARGS__);

#define INTRINSIC_EACH_64(op, ...)           \
  INTRINSIC_REPEAT_32_0(op, 0, __VA_ARGS__); \
  INTRINSIC_REPEAT_32_N(op, 32, __VA_ARGS__);

// Primary template for 128-bit NEON vectors; only specializations are used.
template <typename T>
class V128 {};

template <>
class V128<uint8_t> {
 public:
  using Intrinsic = uint8x16_t;
  using T = uint8_t;
  static constexpr size_t N = 16;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Sets all lanes to the same value.
  HH_INLINE explicit V128(T i) : v_(vdupq_n_u8(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  // C-style cast because NEON has no generic vector cast; vreinterpret is
  // per type pair.
  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other)
      : v_((const uint8x16_t)(other)) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
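  // Equality below yields an all-ones mask (0xFF) in each equal lane.
  // Illustrative example: (V128(T(1)) == V128(T(1))) sets every lane to 0xFF;
  // unequal lanes would be 0x00.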
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_u8(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_u8(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_u8(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vandq_u8(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vorrq_u8(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = veorq_u8(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& AndNot(const Intrinsic& neg_mask) {
    v_ = vbicq_u8(v_, neg_mask);
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<uint16_t> {
 public:
  using Intrinsic = uint16x8_t;
  using T = uint16_t;
  static constexpr size_t N = 8;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_7, T p_6, T p_5, T p_4, T p_3, T p_2, T p_1, T p_0) {
    alignas(16) const uint16_t data[8] = {p_0, p_1, p_2, p_3,
                                          p_4, p_5, p_6, p_7};
    v_ = vld1q_u16(data);
  }

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(vdupq_n_u16(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_u16(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_u16(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_u16(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vandq_u16(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vorrq_u16(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = veorq_u16(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    switch (count) { INTRINSIC_EACH_16(vshlq_n_u16, v_) }
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = vshlq_u16(v_, vreinterpretq_s16_u16(count));
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    switch (count) { INTRINSIC_EACH_16(vshrq_n_u16, v_) }
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = vshlq_u16(v_, vnegq_s16(vreinterpretq_s16_u16(count)));
    return *this;
  }

  HH_INLINE V128& ShiftRightInsert(const Intrinsic& value, const int count) {
    switch (count) { INTRINSIC_EACH_16(vsriq_n_u16, v_, value) }
    return *this;
  }

  HH_INLINE V128& AndNot(const Intrinsic& neg_mask) {
    v_ = vbicq_u16(v_, neg_mask);
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<uint32_t> {
 public:
  using Intrinsic = uint32x4_t;
  using T = uint32_t;
  static constexpr size_t N = 4;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) {
    alignas(16) const T data[4] = {p_0, p_1, p_2, p_3};
    v_ = vld1q_u32(data);
  }

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(vdupq_n_u32(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other)
      : v_((const uint32x4_t)other) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_u32(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_u32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_u32(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vandq_u32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vorrq_u32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = veorq_u32(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    switch (count) { INTRINSIC_EACH_32(vshlq_n_u32, v_) }
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = vshlq_u32(v_, vreinterpretq_s32_u32(count));
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    switch (count) { INTRINSIC_EACH_32(vshrq_n_u32, v_) }
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = vshlq_u32(v_, vnegq_s32(vreinterpretq_s32_u32(count)));
    return *this;
  }

  HH_INLINE V128& ShiftRightInsert(const Intrinsic& value, const int count) {
    switch (count) { INTRINSIC_EACH_32(vsriq_n_u32, v_, value) }
    return *this;
  }

  HH_INLINE V128& AndNot(const Intrinsic& neg_mask) {
    v_ = vbicq_u32(v_, neg_mask);
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<uint64_t> {
 public:
  using Intrinsic = uint64x2_t;
  using T = uint64_t;
  static constexpr size_t N = 2;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_1, T p_0) {
    alignas(16) const T data[2] = {p_0, p_1};
    v_ = vld1q_u64(data);
  }

  // Broadcasts i to all lanes (usually by loading from memory).
  HH_INLINE explicit V128(T i) : v_(vdupq_n_u64(i)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  // There are no greater-than comparison instructions for unsigned T.
  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_u64(v_, other.v_));
  }

  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_u64(v_, other.v_);
    // Prevent Clang from converting to vaddhn when nearby vmovn, which
    // causes four spills in the main loop on ARMv7a.
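    // The empty asm below marks v_ as read and written ("+w"), which keeps
    // the optimizer from fusing this add with a later narrowing instruction
    // (explanatory note; the asm itself emits no code).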
#ifdef __GNUC__
    __asm__("" : "+w"(v_));
#endif
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_u64(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vandq_u64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vorrq_u64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = veorq_u64(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator<<=(const int count) {
    switch (count) { INTRINSIC_EACH_64(vshlq_n_u64, v_) }
    return *this;
  }
  HH_INLINE V128& operator<<=(const Intrinsic& count) {
    v_ = vshlq_u64(v_, vreinterpretq_s64_u64(count));
    return *this;
  }

  HH_INLINE V128& operator>>=(const int count) {
    switch (count) { INTRINSIC_EACH_64(vshrq_n_u64, v_) }
    return *this;
  }
  HH_INLINE V128& operator>>=(const Intrinsic& count) {
    v_ = vshlq_u64(v_, vnegq_s64(vreinterpretq_s64_u64(count)));
    return *this;
  }

  HH_INLINE V128& AndNot(const Intrinsic& neg_mask) {
    v_ = vbicq_u64(v_, neg_mask);
    return *this;
  }

  HH_INLINE V128& ShiftRightInsert(const Intrinsic& value, const int count) {
    switch (count) { INTRINSIC_EACH_64(vsriq_n_u64, v_, value) }
    return *this;
  }

 private:
  Intrinsic v_;
};

// TODO: Enable. For now, this is disabled for the following reasons:
// 1. ARMv7a lacks float64x2_t.
// 2. ARMv7a's float32x4_t arithmetic is not IEEE-754 compliant.
// 3. We don't actually use the float vectors right now.
#if 0
template <>
class V128<float> {
 public:
  using Intrinsic = float32x4_t;
  using T = float;
  static constexpr size_t N = 4;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_3, T p_2, T p_1, T p_0) {
    HH_ALIGNAS(16) float tmp[4] = {p_0, p_1, p_2, p_3};
    v_ = vld1q_f32(tmp);
  }

  // Broadcasts to all lanes.
  HH_INLINE explicit V128(T f) : v_(vdupq_n_f32(f)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_f32(v_, other.v_));
  }
  HH_INLINE V128 operator<(const V128& other) const {
    return V128(vcltq_f32(v_, other.v_));
  }
  HH_INLINE V128 operator>(const V128& other) const {
    return V128(vcltq_f32(other.v_, v_));
  }

  HH_INLINE V128& operator*=(const V128& other) {
    v_ = vmulq_f32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator/=(const V128& other) {
    v_ = vdivq_f32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_f32(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_f32(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v_),
                                         vreinterpretq_u32_f32(other.v_)));
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v_),
                                         vreinterpretq_u32_f32(other.v_)));
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v_),
                                         vreinterpretq_u32_f32(other.v_)));
    return *this;
  }

 private:
  Intrinsic v_;
};

template <>
class V128<double> {
 public:
  using Intrinsic = float64x2_t;
  using T = double;
  static constexpr size_t N = 2;

  // Leaves v_ uninitialized - typically used for output parameters.
  HH_INLINE V128() {}

  // Lane 0 (p_0) is the lowest.
  HH_INLINE V128(T p_1, T p_0) {
    HH_ALIGNAS(16) double tmp[2] = {p_0, p_1};
    v_ = vld1q_f64(tmp);
  }

  // Broadcasts to all lanes.
  HH_INLINE explicit V128(T f) : v_(vdupq_n_f64(f)) {}

  // Copy from other vector.
  HH_INLINE V128(const V128& other) : v_(other.v_) {}

  template <typename U>
  HH_INLINE explicit V128(const V128<U>& other) : v_(other) {}

  HH_INLINE V128& operator=(const V128& other) {
    v_ = other.v_;
    return *this;
  }

  // Convert from/to intrinsics.
  HH_INLINE V128(const Intrinsic& v) : v_(v) {}
  HH_INLINE V128& operator=(const Intrinsic& v) {
    v_ = v;
    return *this;
  }
  HH_INLINE operator Intrinsic() const { return v_; }

  HH_INLINE V128 operator==(const V128& other) const {
    return V128(vceqq_f64(v_, other.v_));
  }
  HH_INLINE V128 operator<(const V128& other) const {
    return V128(vcltq_f64(v_, other.v_));
  }
  HH_INLINE V128 operator>(const V128& other) const {
    return V128(vcltq_f64(other.v_, v_));
  }

  HH_INLINE V128& operator*=(const V128& other) {
    v_ = vmulq_f64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator/=(const V128& other) {
    v_ = vdivq_f64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator+=(const V128& other) {
    v_ = vaddq_f64(v_, other.v_);
    return *this;
  }
  HH_INLINE V128& operator-=(const V128& other) {
    v_ = vsubq_f64(v_, other.v_);
    return *this;
  }

  HH_INLINE V128& operator&=(const V128& other) {
    v_ = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(v_),
                                         vreinterpretq_u64_f64(other.v_)));
    return *this;
  }
  HH_INLINE V128& operator|=(const V128& other) {
    v_ = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(v_),
                                         vreinterpretq_u64_f64(other.v_)));
    return *this;
  }
  HH_INLINE V128& operator^=(const V128& other) {
    v_ = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v_),
                                         vreinterpretq_u64_f64(other.v_)));
    return *this;
  }

 private:
  Intrinsic v_;
};
#endif

// Nonmember functions for any V128 via member functions.
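
// For example (illustrative, matching the header comment at the top of this
// file): given V2x64U a, b, m, the expression (a + b) & m goes through
// operator+ and operator& below and compiles to vaddq_u64 plus vandq_u64.
// operator* and operator/ are only instantiable for specializations that
// define *= and /= (currently only the disabled float types).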
template <typename T>
HH_INLINE V128<T> operator*(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t *= right;
}
template <typename T>
HH_INLINE V128<T> operator/(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t /= right;
}
template <typename T>
HH_INLINE V128<T> operator+(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t += right;
}
template <typename T>
HH_INLINE V128<T> operator-(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t -= right;
}
template <typename T>
HH_INLINE V128<T> operator&(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t &= right;
}
template <typename T>
HH_INLINE V128<T> operator|(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t |= right;
}
template <typename T>
HH_INLINE V128<T> operator^(const V128<T>& left, const V128<T>& right) {
  V128<T> t(left);
  return t ^= right;
}
template <typename T>
HH_INLINE V128<T> operator<<(const V128<T>& v, const int count) {
  V128<T> t(v);
  return t <<= count;
}
template <typename T>
HH_INLINE V128<T> operator>>(const V128<T>& v, const int count) {
  V128<T> t(v);
  return t >>= count;
}

using V16x8U = V128<uint8_t>;
using V8x16U = V128<uint16_t>;
using V4x32U = V128<uint32_t>;
using V2x64U = V128<uint64_t>;
using V4x32F = V128<float>;
using V2x64F = V128<double>;

// Load/Store for any V128.

// We differentiate between targets' vector types via template specialization.
// Calling Load<V>(floats) is more natural than Load(V8x32F(), floats) and may
// generate better code in unoptimized builds. Only declare the primary
// templates to avoid needing mutual exclusion with vector256.
template <class V>
HH_INLINE V Load(const typename V::T* const HH_RESTRICT from);
template <class V>
HH_INLINE V LoadUnaligned(const typename V::T* const HH_RESTRICT from);

// "from" must be vector-aligned.
#ifdef __GNUC__
#define HH_ALIGN(x) __builtin_assume_aligned((x), 16)
#else
#define HH_ALIGN(x) (x)
#endif

template <>
HH_INLINE V16x8U Load(const V16x8U::T* const HH_RESTRICT from) {
  const uint8_t* const HH_RESTRICT p =
      reinterpret_cast<const uint8_t*>(HH_ALIGN(from));
  return V16x8U(vld1q_u8(p));
}
template <>
HH_INLINE V8x16U Load(const V8x16U::T* const HH_RESTRICT from) {
  const uint16_t* const HH_RESTRICT p =
      reinterpret_cast<const uint16_t*>(HH_ALIGN(from));
  return V8x16U(vld1q_u16(p));
}
template <>
HH_INLINE V4x32U Load(const V4x32U::T* const HH_RESTRICT from) {
  const uint32_t* const HH_RESTRICT p =
      reinterpret_cast<const uint32_t*>(HH_ALIGN(from));
  return V4x32U(vld1q_u32(p));
}
template <>
HH_INLINE V2x64U Load(const V2x64U::T* const HH_RESTRICT from) {
  const uint64_t* const HH_RESTRICT p =
      reinterpret_cast<const uint64_t*>(HH_ALIGN(from));
  return V2x64U(vld1q_u64(p));
}

#if 0
template <>
HH_INLINE V4x32F Load(const V4x32F::T* const HH_RESTRICT from) {
  return V4x32F(vld1q_f32(from));
}
template <>
HH_INLINE V2x64F Load(const V2x64F::T* const HH_RESTRICT from) {
  return V2x64F(vld1q_f64(from));
}
#endif

// GCC for 32-bit ARM can generate faulting code for unaligned reads through a
// pointer cast to a wider type; only vld1q_u8 is safe on unaligned pointers.
template <>
HH_INLINE V16x8U LoadUnaligned(const V16x8U::T* const HH_RESTRICT from) {
  const uint8_t* const HH_RESTRICT p = reinterpret_cast<const uint8_t*>(from);
  return V16x8U(vld1q_u8(p));
}
template <>
HH_INLINE V8x16U LoadUnaligned(const V8x16U::T* const HH_RESTRICT from) {
  const uint8_t* const HH_RESTRICT p = reinterpret_cast<const uint8_t*>(from);
  return V8x16U(vreinterpretq_u16_u8(vld1q_u8(p)));
}
template <>
HH_INLINE V4x32U LoadUnaligned(const V4x32U::T* const HH_RESTRICT from) {
  const uint8_t* const HH_RESTRICT p = reinterpret_cast<const uint8_t*>(from);
  return V4x32U(vreinterpretq_u32_u8(vld1q_u8(p)));
}
template <>
HH_INLINE V2x64U LoadUnaligned(const V2x64U::T* const HH_RESTRICT from) {
  const uint8_t* const HH_RESTRICT p = reinterpret_cast<const uint8_t*>(from);
  return V2x64U(vreinterpretq_u64_u8(vld1q_u8(p)));
}

#if 0
template <>
HH_INLINE V4x32F LoadUnaligned(const V4x32F::T* const HH_RESTRICT from) {
  return V4x32F(vld1q_f32(from));
}
template <>
HH_INLINE V2x64F LoadUnaligned(const V2x64F::T* const HH_RESTRICT from) {
  return V2x64F(vld1q_f64(from));
}
#endif

// "to" must be vector-aligned.
template <typename T>
HH_INLINE void Store(const V128<T>& v, T* const HH_RESTRICT to);

template <>
HH_INLINE void Store(const V128<uint8_t>& v, uint8_t* const HH_RESTRICT to) {
  uint8_t* const HH_RESTRICT p = reinterpret_cast<uint8_t*>(HH_ALIGN(to));
  vst1q_u8(p, v);
}
template <>
HH_INLINE void Store(const V128<uint16_t>& v, uint16_t* const HH_RESTRICT to) {
  uint16_t* const HH_RESTRICT p = reinterpret_cast<uint16_t*>(HH_ALIGN(to));
  vst1q_u16(p, v);
}
template <>
HH_INLINE void Store(const V128<uint32_t>& v, uint32_t* const HH_RESTRICT to) {
  uint32_t* const HH_RESTRICT p = reinterpret_cast<uint32_t*>(HH_ALIGN(to));
  vst1q_u32(p, v);
}
template <>
HH_INLINE void Store(const V128<uint64_t>& v, uint64_t* const HH_RESTRICT to) {
  uint64_t* const HH_RESTRICT p = reinterpret_cast<uint64_t*>(HH_ALIGN(to));
  vst1q_u64(p, v);
}

#undef HH_ALIGN

#if 0
HH_INLINE void Store(const V128<float>& v, float* const HH_RESTRICT to) {
  vst1q_f32(to, v);
}
HH_INLINE void Store(const V128<double>& v, double* const HH_RESTRICT to) {
  vst1q_f64(to, v);
}
#endif

template <typename T>
HH_INLINE void StoreUnaligned(const V128<T>& v, T* const HH_RESTRICT to);

template <>
HH_INLINE void StoreUnaligned(const V128<uint8_t>& v,
                              uint8_t* const HH_RESTRICT to) {
  vst1q_u8(to, v);
}
template <>
HH_INLINE void StoreUnaligned(const V128<uint16_t>& v,
                              uint16_t* const HH_RESTRICT to) {
  vst1q_u8(reinterpret_cast<uint8_t*>(to), vreinterpretq_u8_u16(v));
}
template <>
HH_INLINE void StoreUnaligned(const V128<uint32_t>& v,
                              uint32_t* const HH_RESTRICT to) {
  vst1q_u8(reinterpret_cast<uint8_t*>(to), vreinterpretq_u8_u32(v));
}
template <>
HH_INLINE void StoreUnaligned(const V128<uint64_t>& v,
                              uint64_t* const HH_RESTRICT to) {
  vst1q_u8(reinterpret_cast<uint8_t*>(to), vreinterpretq_u8_u64(v));
}

#if 0
HH_INLINE void StoreUnaligned(const V128<float>& v,
                              float* const HH_RESTRICT to) {
  _mm_storeu_ps(to, v);
}
HH_INLINE void StoreUnaligned(const V128<double>& v,
                              double* const HH_RESTRICT to) {
  _mm_storeu_pd(to, v);
}
#endif

// TODO: Enable.
#if 0
// Writes directly to (aligned) memory, bypassing the cache. This is useful for
// data that will not be read again in the near future.
template <typename T>
HH_INLINE void Stream(const V128<T>& v, T* const HH_RESTRICT to) {
  _mm_stream_si128(reinterpret_cast<__m128i* HH_RESTRICT>(to), v);
}
HH_INLINE void Stream(const V128<float>& v, float* const HH_RESTRICT to) {
  _mm_stream_ps(to, v);
}
HH_INLINE void Stream(const V128<double>& v, double* const HH_RESTRICT to) {
  _mm_stream_pd(to, v);
}
#endif

// Miscellaneous functions.
template <typename T>
HH_INLINE V128<T> RotateLeft(const V128<T>& v, const int count) {
  const int num_bits = sizeof(T) * 8;
  // (v << count) | (v >> (num_bits - count)): vsriq inserts the wrapped bits.
  V128<T> tmp(v << count);
  return tmp.ShiftRightInsert(v, num_bits - count);
}

template <typename T>
HH_INLINE V128<T> AndNot(const V128<T>& neg_mask, const V128<T>& values) {
  V128<T> tmp(values);
  return tmp.AndNot(neg_mask);
}

#if 0
template <>
HH_INLINE V128<float> AndNot(const V128<float>& neg_mask,
                             const V128<float>& values) {
  return V128<float>(_mm_andnot_ps(neg_mask, values));
}
template <>
HH_INLINE V128<double> AndNot(const V128<double>& neg_mask,
                              const V128<double>& values) {
  return V128<double>(_mm_andnot_pd(neg_mask, values));
}
#endif

HH_INLINE V4x32U Select(const V4x32U& a, const V4x32U& b, const V4x32U& mask) {
  return V4x32U(vbslq_u32(mask, a, b));
}

HH_INLINE V2x64U Select(const V2x64U& a, const V2x64U& b, const V2x64U& mask) {
  return V2x64U(vbslq_u64(mask, a, b));
}

// Min/Max

HH_INLINE V16x8U Min(const V16x8U& v0, const V16x8U& v1) {
  return V16x8U(vminq_u8(v0, v1));
}

HH_INLINE V16x8U Max(const V16x8U& v0, const V16x8U& v1) {
  return V16x8U(vmaxq_u8(v0, v1));
}

HH_INLINE V8x16U Min(const V8x16U& v0, const V8x16U& v1) {
  return V8x16U(vminq_u16(v0, v1));
}

HH_INLINE V8x16U Max(const V8x16U& v0, const V8x16U& v1) {
  return V8x16U(vmaxq_u16(v0, v1));
}

HH_INLINE V4x32U Min(const V4x32U& v0, const V4x32U& v1) {
  return V4x32U(vminq_u32(v0, v1));
}

HH_INLINE V4x32U Max(const V4x32U& v0, const V4x32U& v1) {
  return V4x32U(vmaxq_u32(v0, v1));
}

#if 0
HH_INLINE V4x32F Min(const V4x32F& v0, const V4x32F& v1) {
  return V4x32F(vminq_f32(v0, v1));
}

HH_INLINE V4x32F Max(const V4x32F& v0, const V4x32F& v1) {
  return V4x32F(vmaxq_f32(v0, v1));
}

HH_INLINE V2x64F Min(const V2x64F& v0, const V2x64F& v1) {
  return V2x64F(vminq_f64(v0, v1));
}

HH_INLINE V2x64F Max(const V2x64F& v0, const V2x64F& v1) {
  return V2x64F(vmaxq_f64(v0, v1));
}
#endif

}  // namespace HH_TARGET_NAME
}  // namespace highwayhash

#endif  // HH_DISABLE_TARGET_SPECIFIC
#endif  // HIGHWAYHASH_VECTOR128_NEON_H_