x86 sse2/avx2: rewrite loongarch immediate operand shift instruction #1247

Open · wants to merge 1 commit into base: master
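
The change is the same in every hunk of this PR: drop the LoongArch immediate-operand shift intrinsics, whose count must be a compile-time constant in the valid range, and instead splat the count into a vector register and use the variable-count shift. A minimal sketch of the before/after pattern for the 16-bit LASX case, assuming a toolchain that provides <lasxintrin.h> (the wrapper names below are illustrative, not part of the patch):

```c
#include <lasxintrin.h>  /* LoongArch LASX intrinsics; assumed available */

/* Before: the immediate form needs a literal count in 0..15, so the old
 * macros had to mask and special-case imm8 at the call site. */
static inline __m256i shl16_imm(__m256i a) {
  return __lasx_xvslli_h(a, 3);                     /* count is a literal */
}

/* After: replicate the (possibly run-time) count across a vector and use
 * the variable-count shift, as the new function bodies in this patch do. */
static inline __m256i shl16_var(__m256i a, int imm8) {
  return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(imm8));
}
```

For counts in range the two forms agree; where an out-of-range imm8 has to yield zero, the guard stays in plain C (simde_mm256_srli_epi16, for example, moves its `if (imm8 > 15)` early return ahead of the new LASX block).
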
54 changes: 35 additions & 19 deletions simde/x86/avx2.h
@@ -4564,6 +4564,10 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
the expected range, hence the discrepancy between what we allow and what
Intel specifies. Some compilers will return 0, others seem to just mask
off everything outside of the range. */
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4586,8 +4590,6 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvslli_h(a, imm8 & 15))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -4603,6 +4605,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4625,8 +4631,6 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi32(a, imm8) (imm8 > 31 ? __lasx_xvreplgr2vr_w(0) : __lasx_xvslli_w(a, imm8 & 31))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -4642,6 +4646,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4659,8 +4667,6 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_slli_epi64(a, imm8) (imm8 > 63 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvslli_d(a, imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_slli_epi64(a, imm8) \
simde_mm256_set_m128i( \
@@ -4927,6 +4933,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h((imm8 > 15 ? 15 : imm8)));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4947,8 +4957,6 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srai_epi16(a, imm8) __lasx_xvsrai_h(a, (imm8 > 15 ? 15 : imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srai_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -4964,6 +4972,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w((imm8 > 31 ? 31 : imm8)));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -4984,8 +4996,6 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srai_epi32(a, imm8) __lasx_xvsrai_w(a, (imm8 > 31 ? 31 : imm8))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srai_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -5183,13 +5193,17 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
if (imm8 > 15)
return simde_mm256_setzero_si256();

#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);

if (imm8 > 15)
return simde_mm256_setzero_si256();

#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8));
for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) {
@@ -5214,8 +5228,6 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvsrli_h(a, imm8 & 15))
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi16(a, imm8) \
simde_mm256_set_m128i( \
@@ -5231,6 +5243,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -5253,8 +5269,6 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi32(a, imm8) __lasx_xvsrli_w(a, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi32(a, imm8) \
simde_mm256_set_m128i( \
@@ -5270,6 +5284,10 @@ SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
SIMDE_REQUIRE_RANGE(imm8, 0, 255) {
#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(imm8));
#endif

simde__m256i_private
r_,
a_ = simde__m256i_to_private(a);
@@ -5287,8 +5305,6 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8)
}
#if defined(SIMDE_X86_AVX2_NATIVE)
# define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
# define simde_mm256_srli_epi64(a, imm8) __lasx_xvsrli_d(a, imm8)
#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128)
# define simde_mm256_srli_epi64(a, imm8) \
simde_mm256_set_m128i( \
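
The sse2.h hunks below apply the same rewrite with the 128-bit LSX intrinsics and additionally move the LSX branch to the front of each function's #if chain, instead of keeping a separate immediate-operand macro after the function definition. Putting the LSX branch first ensures it is actually selected even when SIMDE_VECTOR_SUBSCRIPT_SCALAR is also available, which previously took precedence in the simde_mm_sll_epi16/32 bodies. A rough sketch of the pattern, again assuming the LoongArch headers are present (the wrapper name is made up for illustration):

```c
#include <lsxintrin.h>  /* LoongArch LSX intrinsics; assumed available */

/* Same idea as the LASX sketch above, at 128 bits: splat the count and use
 * the variable-count shift.  The pattern covers both the imm8 helpers and
 * simde_mm_sll_epi16/32, whose count arrives in a register at run time. */
static inline __m128i lsx_shl16(__m128i a, int count) {
  return __lsx_vsll_h(a, __lsx_vreplgr2vr_h(count));
}
```
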
46 changes: 23 additions & 23 deletions simde/x86/sse2.h
@@ -6158,12 +6158,12 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) {
if (count_.u64[0] > 15)
return simde_mm_setzero_si128();

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(count_.u64[0]));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u16 = (a_.u16 << count_.u64[0]);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0])));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vslli_h(a_.lsx_i64, count_.u64[0]);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0));
#else
@@ -6194,12 +6194,12 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) {
if (count_.u64[0] > 31)
return simde_mm_setzero_si128();

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(count_.u64[0]));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u32 = (a_.u32 << count_.u64[0]);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0])));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vslli_w(a_.lsx_i64, count_.u64[0]);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0));
#else
@@ -6561,7 +6561,9 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
#else
const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8;
@@ -6589,8 +6591,6 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
#define simde_mm_slli_epi16(a, imm8) \
((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
@@ -6607,7 +6607,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i32 = a_.i32 << imm8;
#else
SIMDE_VECTORIZE
@@ -6646,8 +6648,6 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8)
} \
ret; \
}))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
@@ -6664,7 +6664,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.i64 = a_.i64 << imm8;
#else
SIMDE_VECTORIZE
@@ -6688,8 +6690,6 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8)
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
#define simde_mm_slli_epi64(a, imm8) \
((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
@@ -6706,7 +6706,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
#else
SIMDE_VECTORIZE
@@ -6733,8 +6735,6 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8)
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
#define simde_mm_srli_epi16(a, imm8) \
((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8)))))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
@@ -6751,7 +6751,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
r_,
a_ = simde__m128i_to_private(a);

#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
#if defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(imm8));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff);
#else
SIMDE_VECTORIZE
@@ -6790,8 +6792,6 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8)
} \
ret; \
}))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
@@ -6810,6 +6810,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
r_.lsx_i64 = __lsx_vsrl_d(a_.lsx_i64, __lsx_vreplgr2vr_d(imm8));
#else
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8);
@@ -6836,8 +6838,6 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8)
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
#define simde_mm_srli_epi64(a, imm8) \
((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0))
#elif defined(SIMDE_LOONGARCH_LSX_NATIVE)
#define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63))))
#endif
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)