diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index f1cf63f..ca2b5d5 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ -130,7 +130,7 @@ fn generate_body_soft_arch(intr: &Intrinsic) -> Result { let args = intr.parameter.iter().map(|param| -> syn::Expr { let name = ident_opt_s(¶m.varname).unwrap(); - syn::parse_quote! { #name } + syn::parse_quote! { #name as _ } }); block.stmts.push(syn::parse_quote! { @@ -200,54 +200,49 @@ fn random_value(ty: &str, rng: &mut SmallRng) -> Result { _mm_setr_epi16(#(#args),*) } } - _ => bail!("unknown type: {ty}"), + _ => bail!("unknown type for random value: {ty}"), }) } #[derive(Clone, Copy, PartialEq, Debug)] enum Type { Vector(VectorType), - Scalar { - /// Some parameters have C types that are signed, while their `etype` is not. - c_is_signed: bool, - elemty: ElementType, - }, + Scalar { elemty: ElementType }, } +/// A SIMD vector type like `16xi8 (__m128i)` #[derive(Clone, Copy, PartialEq, Debug)] struct VectorType { + /// The amount of lanes, `16` in `16xi8 (__m128i)`. lanes: u64, + /// The type of a single lane, `i8` in `16xi8 (__m128i)`. elem: ElementType, + /// The raw Rust/C type, `__m128i` in `16xi8 (__m128i)`. raw_type: &'static str, } +/// A single element in a vector. +/// For example in `16xi8 (__m128i)`, it would be `i8` (we do not care about signedness). #[derive(Clone, Copy, PartialEq, Debug)] struct ElementType { - is_signed: bool, width: u64, } impl Type { fn of(etype: &str, ty: &str) -> Result { - let (etype_signed, etype_width) = match etype { - "SI8" => (true, 8), - "SI16" => (true, 16), - "SI32" => (true, 32), - "UI8" => (false, 8), - "UI16" => (false, 16), - "UI32" => (false, 32), - "UI64" => (false, 64), + let etype_width = match etype { + "SI8" => 8, + "SI16" => 16, + "SI32" => 32, + "UI8" => 8, + "UI16" => 16, + "UI32" => 32, + "UI64" => 64, _ => bail!("unknown element type: {etype}"), }; - let elem = ElementType { - is_signed: etype_signed, - width: etype_width, - }; + let elem = ElementType { width: etype_width }; - let scalar = |sign| Type::Scalar { - c_is_signed: sign, - elemty: elem, - }; + let scalar = Type::Scalar { elemty: elem }; Ok(match ty { "__m128i" => Type::Vector(VectorType { @@ -255,10 +250,10 @@ impl Type { elem, raw_type: "__m128i", }), - "char" => scalar(true), - "short" => scalar(true), - "int" => scalar(true), - "__int64" => scalar(true), + "char" => scalar, + "short" => scalar, + "int" => scalar, + "__int64" => scalar, _ => bail!("unknown type: {ty}"), }) } @@ -276,18 +271,11 @@ impl Type { }; ty } - fn expect_scalar(&self) -> ElementType { - let Self::Scalar { elemty, .. } = *self else { - panic!("expected scalar, found vector"); - }; - elemty - } } impl ElementType { fn rust_type(&self) -> String { - let pre = if self.is_signed { 'i' } else { 'u' }; - format!("{pre}{}", self.width) + format!("u{}", self.width) } } @@ -461,30 +449,8 @@ fn gen_expr_tmp( let (result, ty): (syn::Expr, _) = match expr { Expr::Int(int) => (syn::parse_quote! { #int }, None), Expr::Ident(identifier) => { - let ty = type_of_ident(&identifier); let identifier = ident(&identifier); - match ty { - Ok(Type::Scalar { - c_is_signed, - elemty, - }) if elemty.is_signed != c_is_signed => { - // intel intrinsics types kinda lie sometimes. - // _mm_setr_epi16 says the etype of the argument is UI16 (unsigned), - // while the actual type is short (signed). Do a cast to the etype, since we used that. - let from = ElementType { - is_signed: c_is_signed, - width: elemty.width, - } - .rust_type(); - let to = elemty.rust_type(); - let method = ident(&format!("cast_sign_{from}_{to}")); - ( - tmp(block, syn::parse_quote! { self.#method(#identifier) }), - None, - ) - } - _ => (syn::parse_quote! { #identifier }, None), - } + (syn::parse_quote! { #identifier }, None) } Expr::Index { lhs, idx } => { let (identifier, method, lane_idx, ty) = gen_idx("get", *lhs, *idx, type_of_ident)?; @@ -492,13 +458,7 @@ fn gen_expr_tmp( block, syn::parse_quote! { self.#method(#identifier, #lane_idx) }, ); - ( - expr, - Some(Type::Scalar { - c_is_signed: ty.elem.is_signed, - elemty: ty.elem, - }), - ) + (expr, Some(Type::Scalar { elemty: ty.elem })) } Expr::Range { .. } => todo!(), Expr::Call { function, args } => { @@ -543,24 +503,25 @@ fn gen_expr_tmp( }; syn::parse_quote! { ( #lhs #token #rhs ) } } - Some(ty) => { + Some(_ty) => { let prefix = match op { BinaryOpKind::Add => "add", BinaryOpKind::Mul => "mul", }; - let ty = ty.expect_scalar(); - let method = ident(&format!( - "ext_{}_{}64", - ty.rust_type(), - if ty.is_signed { "s" } else { "u" } - )); - let lhs_ext = tmp(block, syn::parse_quote! { self.#method(#lhs) }); - let rhs_ext = tmp(block, syn::parse_quote! { self.#method(#rhs) }); + // TODO: EXTEND somehow possibly??? ugh. + + //let ty = ty.expect_scalar(); + //let method = ident(&format!( + // "ext_{}_u64", + // ty.rust_type(), + // if ty.is_signed { "s" } else { "u" } + //)); + //let lhs_ext = tmp(block, syn::parse_quote! { self.#method(#lhs) }); + //let rhs_ext = tmp(block, syn::parse_quote! { self.#method(#rhs) }); - // TODO: EXTEND let method = ident(&format!("{prefix}_64")); - tmp(block, syn::parse_quote! { self.#method(#lhs_ext, #rhs_ext) }) + tmp(block, syn::parse_quote! { self.#method(#lhs, #rhs) }) } }; @@ -590,7 +551,7 @@ fn signature(intr: &Intrinsic, body: syn::Block) -> Result { let name = ident(&intr.name); let ret_name = ident_opt_s(&intr.ret.varname)?; - let ret_ty = ident(map_type_to_rust(intr.ret.r#type.as_ref().unwrap())); + let ret_ty = ident(map_type_to_rust_unsigned(intr.ret.r#type.as_ref().unwrap())); let args = [ syn::parse_quote! { &mut self }, @@ -599,7 +560,7 @@ fn signature(intr: &Intrinsic, body: syn::Block) -> Result { .into_iter() .chain(intr.parameter.iter().map(|param| -> syn::FnArg { let varname = ident_opt_s(¶m.varname).unwrap(); - let ty = ident(map_type_to_rust(param.r#type.as_ref().unwrap())); + let ty = ident(map_type_to_rust_unsigned(param.r#type.as_ref().unwrap())); syn::parse_quote! { #varname: Self::#ty } })); @@ -632,3 +593,14 @@ fn map_type_to_rust(ty: &str) -> &str { ty => panic!("unknown type: {ty}"), } } + +fn map_type_to_rust_unsigned(ty: &str) -> &str { + match ty { + "__m128i" => ty, + "char" => "u8", + "short" => "u16", + "int" => "u32", + "__int64" => "u64", + ty => panic!("unknown type: {ty}"), + } +} diff --git a/crates/generate/src/main.rs b/crates/generate/src/main.rs index b801f9b..837844d 100644 --- a/crates/generate/src/main.rs +++ b/crates/generate/src/main.rs @@ -92,11 +92,11 @@ const INTRINSICS_GENERATE: &[&str] = &[ ///// ///// Arithmetic ///// - "_mm_add_epi16", - "_mm_add_epi32", - "_mm_add_epi64", + // "_mm_add_epi16", + // "_mm_add_epi32", + // "_mm_add_epi64", // todo: float and __m64 stuff - "_mm_adds_epi16", + // "_mm_adds_epi16", //"_mm_adds_epi8", //"_mm_adds_epu16", //"_mm_adds_epu8", diff --git a/crates/intringen/src/x86/generated.rs b/crates/intringen/src/x86/generated.rs index 54185e1..ac43483 100644 --- a/crates/intringen/src/x86/generated.rs +++ b/crates/intringen/src/x86/generated.rs @@ -1,175 +1,80 @@ #![allow(unused_parens)] impl Intrinsics for C {} pub trait Intrinsics: super::Core { - fn _mm_add_epi16( - &mut self, - dst: &mut Self::__m128i, - a: Self::__m128i, - b: Self::__m128i, - ) { - for j in 0u64..=7u64 { - let i = (j * 16u64); - let __tmp0 = self.get_lane___m128i_u16(a, (i / 16u64)); - let __tmp1 = self.get_lane___m128i_u16(b, (i / 16u64)); - let __tmp2 = self.ext_u16_u64(__tmp0); - let __tmp3 = self.ext_u16_u64(__tmp1); - let __tmp4 = self.add_64(__tmp2, __tmp3); - self.set_lane___m128i_u16(dst, (i / 16u64), __tmp4); - } - } - fn _mm_add_epi32( - &mut self, - dst: &mut Self::__m128i, - a: Self::__m128i, - b: Self::__m128i, - ) { - for j in 0u64..=3u64 { - let i = (j * 32u64); - let __tmp0 = self.get_lane___m128i_u32(a, (i / 32u64)); - let __tmp1 = self.get_lane___m128i_u32(b, (i / 32u64)); - let __tmp2 = self.ext_u32_u64(__tmp0); - let __tmp3 = self.ext_u32_u64(__tmp1); - let __tmp4 = self.add_64(__tmp2, __tmp3); - self.set_lane___m128i_u32(dst, (i / 32u64), __tmp4); - } - } - fn _mm_add_epi64( - &mut self, - dst: &mut Self::__m128i, - a: Self::__m128i, - b: Self::__m128i, - ) { - for j in 0u64..=1u64 { - let i = (j * 64u64); - let __tmp0 = self.get_lane___m128i_u64(a, (i / 64u64)); - let __tmp1 = self.get_lane___m128i_u64(b, (i / 64u64)); - let __tmp2 = self.ext_u64_u64(__tmp0); - let __tmp3 = self.ext_u64_u64(__tmp1); - let __tmp4 = self.add_64(__tmp2, __tmp3); - self.set_lane___m128i_u64(dst, (i / 64u64), __tmp4); - } - } - fn _mm_adds_epi16( - &mut self, - dst: &mut Self::__m128i, - a: Self::__m128i, - b: Self::__m128i, - ) { - for j in 0u64..=7u64 { - let i = (j * 16u64); - let __tmp0 = self.get_lane___m128i_i16(a, (i / 16u64)); - let __tmp1 = self.get_lane___m128i_i16(b, (i / 16u64)); - let __tmp2 = self.ext_i16_s64(__tmp0); - let __tmp3 = self.ext_i16_s64(__tmp1); - let __tmp4 = self.add_64(__tmp2, __tmp3); - let __tmp5 = self.saturate16(__tmp4); - self.set_lane___m128i_u16(dst, (i / 16u64), __tmp5); - } - } - fn _mm_set_epi64x(&mut self, dst: &mut Self::__m128i, e1: Self::i64, e0: Self::i64) { - let __tmp0 = self.cast_sign_i64_u64(e0); - self.set_lane___m128i_u64(dst, 0u64, __tmp0); - let __tmp1 = self.cast_sign_i64_u64(e1); - self.set_lane___m128i_u64(dst, 1u64, __tmp1); + fn _mm_set_epi64x(&mut self, dst: &mut Self::__m128i, e1: Self::u64, e0: Self::u64) { + self.set_lane___m128i_u64(dst, 0u64, e0); + self.set_lane___m128i_u64(dst, 1u64, e1); } fn _mm_setr_epi32( &mut self, dst: &mut Self::__m128i, - e3: Self::i32, - e2: Self::i32, - e1: Self::i32, - e0: Self::i32, + e3: Self::u32, + e2: Self::u32, + e1: Self::u32, + e0: Self::u32, ) { - let __tmp0 = self.cast_sign_i32_u32(e3); - self.set_lane___m128i_u32(dst, 0u64, __tmp0); - let __tmp1 = self.cast_sign_i32_u32(e2); - self.set_lane___m128i_u32(dst, 1u64, __tmp1); - let __tmp2 = self.cast_sign_i32_u32(e1); - self.set_lane___m128i_u32(dst, 2u64, __tmp2); - let __tmp3 = self.cast_sign_i32_u32(e0); - self.set_lane___m128i_u32(dst, 3u64, __tmp3); + self.set_lane___m128i_u32(dst, 0u64, e3); + self.set_lane___m128i_u32(dst, 1u64, e2); + self.set_lane___m128i_u32(dst, 2u64, e1); + self.set_lane___m128i_u32(dst, 3u64, e0); } fn _mm_setr_epi16( &mut self, dst: &mut Self::__m128i, - e7: Self::i16, - e6: Self::i16, - e5: Self::i16, - e4: Self::i16, - e3: Self::i16, - e2: Self::i16, - e1: Self::i16, - e0: Self::i16, + e7: Self::u16, + e6: Self::u16, + e5: Self::u16, + e4: Self::u16, + e3: Self::u16, + e2: Self::u16, + e1: Self::u16, + e0: Self::u16, ) { - let __tmp0 = self.cast_sign_i16_u16(e7); - self.set_lane___m128i_u16(dst, 0u64, __tmp0); - let __tmp1 = self.cast_sign_i16_u16(e6); - self.set_lane___m128i_u16(dst, 1u64, __tmp1); - let __tmp2 = self.cast_sign_i16_u16(e5); - self.set_lane___m128i_u16(dst, 2u64, __tmp2); - let __tmp3 = self.cast_sign_i16_u16(e4); - self.set_lane___m128i_u16(dst, 3u64, __tmp3); - let __tmp4 = self.cast_sign_i16_u16(e3); - self.set_lane___m128i_u16(dst, 4u64, __tmp4); - let __tmp5 = self.cast_sign_i16_u16(e2); - self.set_lane___m128i_u16(dst, 5u64, __tmp5); - let __tmp6 = self.cast_sign_i16_u16(e1); - self.set_lane___m128i_u16(dst, 6u64, __tmp6); - let __tmp7 = self.cast_sign_i16_u16(e0); - self.set_lane___m128i_u16(dst, 7u64, __tmp7); + self.set_lane___m128i_u16(dst, 0u64, e7); + self.set_lane___m128i_u16(dst, 1u64, e6); + self.set_lane___m128i_u16(dst, 2u64, e5); + self.set_lane___m128i_u16(dst, 3u64, e4); + self.set_lane___m128i_u16(dst, 4u64, e3); + self.set_lane___m128i_u16(dst, 5u64, e2); + self.set_lane___m128i_u16(dst, 6u64, e1); + self.set_lane___m128i_u16(dst, 7u64, e0); } fn _mm_setr_epi8( &mut self, dst: &mut Self::__m128i, - e15: Self::i8, - e14: Self::i8, - e13: Self::i8, - e12: Self::i8, - e11: Self::i8, - e10: Self::i8, - e9: Self::i8, - e8: Self::i8, - e7: Self::i8, - e6: Self::i8, - e5: Self::i8, - e4: Self::i8, - e3: Self::i8, - e2: Self::i8, - e1: Self::i8, - e0: Self::i8, + e15: Self::u8, + e14: Self::u8, + e13: Self::u8, + e12: Self::u8, + e11: Self::u8, + e10: Self::u8, + e9: Self::u8, + e8: Self::u8, + e7: Self::u8, + e6: Self::u8, + e5: Self::u8, + e4: Self::u8, + e3: Self::u8, + e2: Self::u8, + e1: Self::u8, + e0: Self::u8, ) { - let __tmp0 = self.cast_sign_i8_u8(e15); - self.set_lane___m128i_u8(dst, 0u64, __tmp0); - let __tmp1 = self.cast_sign_i8_u8(e14); - self.set_lane___m128i_u8(dst, 1u64, __tmp1); - let __tmp2 = self.cast_sign_i8_u8(e13); - self.set_lane___m128i_u8(dst, 2u64, __tmp2); - let __tmp3 = self.cast_sign_i8_u8(e12); - self.set_lane___m128i_u8(dst, 3u64, __tmp3); - let __tmp4 = self.cast_sign_i8_u8(e11); - self.set_lane___m128i_u8(dst, 4u64, __tmp4); - let __tmp5 = self.cast_sign_i8_u8(e10); - self.set_lane___m128i_u8(dst, 5u64, __tmp5); - let __tmp6 = self.cast_sign_i8_u8(e9); - self.set_lane___m128i_u8(dst, 6u64, __tmp6); - let __tmp7 = self.cast_sign_i8_u8(e8); - self.set_lane___m128i_u8(dst, 7u64, __tmp7); - let __tmp8 = self.cast_sign_i8_u8(e7); - self.set_lane___m128i_u8(dst, 8u64, __tmp8); - let __tmp9 = self.cast_sign_i8_u8(e6); - self.set_lane___m128i_u8(dst, 9u64, __tmp9); - let __tmp10 = self.cast_sign_i8_u8(e5); - self.set_lane___m128i_u8(dst, 10u64, __tmp10); - let __tmp11 = self.cast_sign_i8_u8(e4); - self.set_lane___m128i_u8(dst, 11u64, __tmp11); - let __tmp12 = self.cast_sign_i8_u8(e3); - self.set_lane___m128i_u8(dst, 12u64, __tmp12); - let __tmp13 = self.cast_sign_i8_u8(e2); - self.set_lane___m128i_u8(dst, 13u64, __tmp13); - let __tmp14 = self.cast_sign_i8_u8(e1); - self.set_lane___m128i_u8(dst, 14u64, __tmp14); - let __tmp15 = self.cast_sign_i8_u8(e0); - self.set_lane___m128i_u8(dst, 15u64, __tmp15); + self.set_lane___m128i_u8(dst, 0u64, e15); + self.set_lane___m128i_u8(dst, 1u64, e14); + self.set_lane___m128i_u8(dst, 2u64, e13); + self.set_lane___m128i_u8(dst, 3u64, e12); + self.set_lane___m128i_u8(dst, 4u64, e11); + self.set_lane___m128i_u8(dst, 5u64, e10); + self.set_lane___m128i_u8(dst, 6u64, e9); + self.set_lane___m128i_u8(dst, 7u64, e8); + self.set_lane___m128i_u8(dst, 8u64, e7); + self.set_lane___m128i_u8(dst, 9u64, e6); + self.set_lane___m128i_u8(dst, 10u64, e5); + self.set_lane___m128i_u8(dst, 11u64, e4); + self.set_lane___m128i_u8(dst, 12u64, e3); + self.set_lane___m128i_u8(dst, 13u64, e2); + self.set_lane___m128i_u8(dst, 14u64, e1); + self.set_lane___m128i_u8(dst, 15u64, e0); } fn _mm_packs_epi16( &mut self, @@ -177,54 +82,54 @@ pub trait Intrinsics: super::Core { a: Self::__m128i, b: Self::__m128i, ) { - let __tmp0 = self.get_lane___m128i_i16(a, 0u64); + let __tmp0 = self.get_lane___m128i_u16(a, 0u64); let __tmp1 = self.saturate8(__tmp0); - self.set_lane___m128i_i8(dst, 0u64, __tmp1); - let __tmp2 = self.get_lane___m128i_i16(a, 1u64); + self.set_lane___m128i_u8(dst, 0u64, __tmp1); + let __tmp2 = self.get_lane___m128i_u16(a, 1u64); let __tmp3 = self.saturate8(__tmp2); - self.set_lane___m128i_i8(dst, 1u64, __tmp3); - let __tmp4 = self.get_lane___m128i_i16(a, 2u64); + self.set_lane___m128i_u8(dst, 1u64, __tmp3); + let __tmp4 = self.get_lane___m128i_u16(a, 2u64); let __tmp5 = self.saturate8(__tmp4); - self.set_lane___m128i_i8(dst, 2u64, __tmp5); - let __tmp6 = self.get_lane___m128i_i16(a, 3u64); + self.set_lane___m128i_u8(dst, 2u64, __tmp5); + let __tmp6 = self.get_lane___m128i_u16(a, 3u64); let __tmp7 = self.saturate8(__tmp6); - self.set_lane___m128i_i8(dst, 3u64, __tmp7); - let __tmp8 = self.get_lane___m128i_i16(a, 4u64); + self.set_lane___m128i_u8(dst, 3u64, __tmp7); + let __tmp8 = self.get_lane___m128i_u16(a, 4u64); let __tmp9 = self.saturate8(__tmp8); - self.set_lane___m128i_i8(dst, 4u64, __tmp9); - let __tmp10 = self.get_lane___m128i_i16(a, 5u64); + self.set_lane___m128i_u8(dst, 4u64, __tmp9); + let __tmp10 = self.get_lane___m128i_u16(a, 5u64); let __tmp11 = self.saturate8(__tmp10); - self.set_lane___m128i_i8(dst, 5u64, __tmp11); - let __tmp12 = self.get_lane___m128i_i16(a, 6u64); + self.set_lane___m128i_u8(dst, 5u64, __tmp11); + let __tmp12 = self.get_lane___m128i_u16(a, 6u64); let __tmp13 = self.saturate8(__tmp12); - self.set_lane___m128i_i8(dst, 6u64, __tmp13); - let __tmp14 = self.get_lane___m128i_i16(a, 7u64); + self.set_lane___m128i_u8(dst, 6u64, __tmp13); + let __tmp14 = self.get_lane___m128i_u16(a, 7u64); let __tmp15 = self.saturate8(__tmp14); - self.set_lane___m128i_i8(dst, 7u64, __tmp15); - let __tmp16 = self.get_lane___m128i_i16(b, 0u64); + self.set_lane___m128i_u8(dst, 7u64, __tmp15); + let __tmp16 = self.get_lane___m128i_u16(b, 0u64); let __tmp17 = self.saturate8(__tmp16); - self.set_lane___m128i_i8(dst, 8u64, __tmp17); - let __tmp18 = self.get_lane___m128i_i16(b, 1u64); + self.set_lane___m128i_u8(dst, 8u64, __tmp17); + let __tmp18 = self.get_lane___m128i_u16(b, 1u64); let __tmp19 = self.saturate8(__tmp18); - self.set_lane___m128i_i8(dst, 9u64, __tmp19); - let __tmp20 = self.get_lane___m128i_i16(b, 2u64); + self.set_lane___m128i_u8(dst, 9u64, __tmp19); + let __tmp20 = self.get_lane___m128i_u16(b, 2u64); let __tmp21 = self.saturate8(__tmp20); - self.set_lane___m128i_i8(dst, 10u64, __tmp21); - let __tmp22 = self.get_lane___m128i_i16(b, 3u64); + self.set_lane___m128i_u8(dst, 10u64, __tmp21); + let __tmp22 = self.get_lane___m128i_u16(b, 3u64); let __tmp23 = self.saturate8(__tmp22); - self.set_lane___m128i_i8(dst, 11u64, __tmp23); - let __tmp24 = self.get_lane___m128i_i16(b, 4u64); + self.set_lane___m128i_u8(dst, 11u64, __tmp23); + let __tmp24 = self.get_lane___m128i_u16(b, 4u64); let __tmp25 = self.saturate8(__tmp24); - self.set_lane___m128i_i8(dst, 12u64, __tmp25); - let __tmp26 = self.get_lane___m128i_i16(b, 5u64); + self.set_lane___m128i_u8(dst, 12u64, __tmp25); + let __tmp26 = self.get_lane___m128i_u16(b, 5u64); let __tmp27 = self.saturate8(__tmp26); - self.set_lane___m128i_i8(dst, 13u64, __tmp27); - let __tmp28 = self.get_lane___m128i_i16(b, 6u64); + self.set_lane___m128i_u8(dst, 13u64, __tmp27); + let __tmp28 = self.get_lane___m128i_u16(b, 6u64); let __tmp29 = self.saturate8(__tmp28); - self.set_lane___m128i_i8(dst, 14u64, __tmp29); - let __tmp30 = self.get_lane___m128i_i16(b, 7u64); + self.set_lane___m128i_u8(dst, 14u64, __tmp29); + let __tmp30 = self.get_lane___m128i_u16(b, 7u64); let __tmp31 = self.saturate8(__tmp30); - self.set_lane___m128i_i8(dst, 15u64, __tmp31); + self.set_lane___m128i_u8(dst, 15u64, __tmp31); } fn _mm_packs_epi32( &mut self, @@ -232,30 +137,30 @@ pub trait Intrinsics: super::Core { a: Self::__m128i, b: Self::__m128i, ) { - let __tmp0 = self.get_lane___m128i_i32(a, 0u64); + let __tmp0 = self.get_lane___m128i_u32(a, 0u64); let __tmp1 = self.saturate16(__tmp0); - self.set_lane___m128i_i16(dst, 0u64, __tmp1); - let __tmp2 = self.get_lane___m128i_i32(a, 1u64); + self.set_lane___m128i_u16(dst, 0u64, __tmp1); + let __tmp2 = self.get_lane___m128i_u32(a, 1u64); let __tmp3 = self.saturate16(__tmp2); - self.set_lane___m128i_i16(dst, 1u64, __tmp3); - let __tmp4 = self.get_lane___m128i_i32(a, 2u64); + self.set_lane___m128i_u16(dst, 1u64, __tmp3); + let __tmp4 = self.get_lane___m128i_u32(a, 2u64); let __tmp5 = self.saturate16(__tmp4); - self.set_lane___m128i_i16(dst, 2u64, __tmp5); - let __tmp6 = self.get_lane___m128i_i32(a, 3u64); + self.set_lane___m128i_u16(dst, 2u64, __tmp5); + let __tmp6 = self.get_lane___m128i_u32(a, 3u64); let __tmp7 = self.saturate16(__tmp6); - self.set_lane___m128i_i16(dst, 3u64, __tmp7); - let __tmp8 = self.get_lane___m128i_i32(b, 0u64); + self.set_lane___m128i_u16(dst, 3u64, __tmp7); + let __tmp8 = self.get_lane___m128i_u32(b, 0u64); let __tmp9 = self.saturate16(__tmp8); - self.set_lane___m128i_i16(dst, 4u64, __tmp9); - let __tmp10 = self.get_lane___m128i_i32(b, 1u64); + self.set_lane___m128i_u16(dst, 4u64, __tmp9); + let __tmp10 = self.get_lane___m128i_u32(b, 1u64); let __tmp11 = self.saturate16(__tmp10); - self.set_lane___m128i_i16(dst, 5u64, __tmp11); - let __tmp12 = self.get_lane___m128i_i32(b, 2u64); + self.set_lane___m128i_u16(dst, 5u64, __tmp11); + let __tmp12 = self.get_lane___m128i_u32(b, 2u64); let __tmp13 = self.saturate16(__tmp12); - self.set_lane___m128i_i16(dst, 6u64, __tmp13); - let __tmp14 = self.get_lane___m128i_i32(b, 3u64); + self.set_lane___m128i_u16(dst, 6u64, __tmp13); + let __tmp14 = self.get_lane___m128i_u32(b, 3u64); let __tmp15 = self.saturate16(__tmp14); - self.set_lane___m128i_i16(dst, 7u64, __tmp15); + self.set_lane___m128i_u16(dst, 7u64, __tmp15); } fn _mm_packus_epi16( &mut self, @@ -263,52 +168,52 @@ pub trait Intrinsics: super::Core { a: Self::__m128i, b: Self::__m128i, ) { - let __tmp0 = self.get_lane___m128i_i16(a, 0u64); + let __tmp0 = self.get_lane___m128i_u16(a, 0u64); let __tmp1 = self.saturate_u8(__tmp0); self.set_lane___m128i_u8(dst, 0u64, __tmp1); - let __tmp2 = self.get_lane___m128i_i16(a, 1u64); + let __tmp2 = self.get_lane___m128i_u16(a, 1u64); let __tmp3 = self.saturate_u8(__tmp2); self.set_lane___m128i_u8(dst, 1u64, __tmp3); - let __tmp4 = self.get_lane___m128i_i16(a, 2u64); + let __tmp4 = self.get_lane___m128i_u16(a, 2u64); let __tmp5 = self.saturate_u8(__tmp4); self.set_lane___m128i_u8(dst, 2u64, __tmp5); - let __tmp6 = self.get_lane___m128i_i16(a, 3u64); + let __tmp6 = self.get_lane___m128i_u16(a, 3u64); let __tmp7 = self.saturate_u8(__tmp6); self.set_lane___m128i_u8(dst, 3u64, __tmp7); - let __tmp8 = self.get_lane___m128i_i16(a, 4u64); + let __tmp8 = self.get_lane___m128i_u16(a, 4u64); let __tmp9 = self.saturate_u8(__tmp8); self.set_lane___m128i_u8(dst, 4u64, __tmp9); - let __tmp10 = self.get_lane___m128i_i16(a, 5u64); + let __tmp10 = self.get_lane___m128i_u16(a, 5u64); let __tmp11 = self.saturate_u8(__tmp10); self.set_lane___m128i_u8(dst, 5u64, __tmp11); - let __tmp12 = self.get_lane___m128i_i16(a, 6u64); + let __tmp12 = self.get_lane___m128i_u16(a, 6u64); let __tmp13 = self.saturate_u8(__tmp12); self.set_lane___m128i_u8(dst, 6u64, __tmp13); - let __tmp14 = self.get_lane___m128i_i16(a, 7u64); + let __tmp14 = self.get_lane___m128i_u16(a, 7u64); let __tmp15 = self.saturate_u8(__tmp14); self.set_lane___m128i_u8(dst, 7u64, __tmp15); - let __tmp16 = self.get_lane___m128i_i16(b, 0u64); + let __tmp16 = self.get_lane___m128i_u16(b, 0u64); let __tmp17 = self.saturate_u8(__tmp16); self.set_lane___m128i_u8(dst, 8u64, __tmp17); - let __tmp18 = self.get_lane___m128i_i16(b, 1u64); + let __tmp18 = self.get_lane___m128i_u16(b, 1u64); let __tmp19 = self.saturate_u8(__tmp18); self.set_lane___m128i_u8(dst, 9u64, __tmp19); - let __tmp20 = self.get_lane___m128i_i16(b, 2u64); + let __tmp20 = self.get_lane___m128i_u16(b, 2u64); let __tmp21 = self.saturate_u8(__tmp20); self.set_lane___m128i_u8(dst, 10u64, __tmp21); - let __tmp22 = self.get_lane___m128i_i16(b, 3u64); + let __tmp22 = self.get_lane___m128i_u16(b, 3u64); let __tmp23 = self.saturate_u8(__tmp22); self.set_lane___m128i_u8(dst, 11u64, __tmp23); - let __tmp24 = self.get_lane___m128i_i16(b, 4u64); + let __tmp24 = self.get_lane___m128i_u16(b, 4u64); let __tmp25 = self.saturate_u8(__tmp24); self.set_lane___m128i_u8(dst, 12u64, __tmp25); - let __tmp26 = self.get_lane___m128i_i16(b, 5u64); + let __tmp26 = self.get_lane___m128i_u16(b, 5u64); let __tmp27 = self.saturate_u8(__tmp26); self.set_lane___m128i_u8(dst, 13u64, __tmp27); - let __tmp28 = self.get_lane___m128i_i16(b, 6u64); + let __tmp28 = self.get_lane___m128i_u16(b, 6u64); let __tmp29 = self.saturate_u8(__tmp28); self.set_lane___m128i_u8(dst, 14u64, __tmp29); - let __tmp30 = self.get_lane___m128i_i16(b, 7u64); + let __tmp30 = self.get_lane___m128i_u16(b, 7u64); let __tmp31 = self.saturate_u8(__tmp30); self.set_lane___m128i_u8(dst, 15u64, __tmp31); } @@ -318,52 +223,52 @@ pub trait Intrinsics: super::Core { a: Self::__m128i, b: Self::__m128i, ) { - let __tmp0 = self.get_lane___m128i_i32(a, 0u64); + let __tmp0 = self.get_lane___m128i_u32(a, 0u64); let __tmp1 = self.saturate_u16(__tmp0); self.set_lane___m128i_u16(dst, 0u64, __tmp1); - let __tmp2 = self.get_lane___m128i_i32(a, 1u64); + let __tmp2 = self.get_lane___m128i_u32(a, 1u64); let __tmp3 = self.saturate_u16(__tmp2); self.set_lane___m128i_u16(dst, 1u64, __tmp3); - let __tmp4 = self.get_lane___m128i_i32(a, 2u64); + let __tmp4 = self.get_lane___m128i_u32(a, 2u64); let __tmp5 = self.saturate_u16(__tmp4); self.set_lane___m128i_u16(dst, 2u64, __tmp5); - let __tmp6 = self.get_lane___m128i_i32(a, 3u64); + let __tmp6 = self.get_lane___m128i_u32(a, 3u64); let __tmp7 = self.saturate_u16(__tmp6); self.set_lane___m128i_u16(dst, 3u64, __tmp7); - let __tmp8 = self.get_lane___m128i_i32(b, 0u64); + let __tmp8 = self.get_lane___m128i_u32(b, 0u64); let __tmp9 = self.saturate_u16(__tmp8); self.set_lane___m128i_u16(dst, 4u64, __tmp9); - let __tmp10 = self.get_lane___m128i_i32(b, 1u64); + let __tmp10 = self.get_lane___m128i_u32(b, 1u64); let __tmp11 = self.saturate_u16(__tmp10); self.set_lane___m128i_u16(dst, 5u64, __tmp11); - let __tmp12 = self.get_lane___m128i_i32(b, 2u64); + let __tmp12 = self.get_lane___m128i_u32(b, 2u64); let __tmp13 = self.saturate_u16(__tmp12); self.set_lane___m128i_u16(dst, 6u64, __tmp13); - let __tmp14 = self.get_lane___m128i_i32(b, 3u64); + let __tmp14 = self.get_lane___m128i_u32(b, 3u64); let __tmp15 = self.saturate_u16(__tmp14); self.set_lane___m128i_u16(dst, 7u64, __tmp15); } fn _mm_abs_epi8(&mut self, dst: &mut Self::__m128i, a: Self::__m128i) { for j in 0u64..=15u64 { let i = (j * 8u64); - let __tmp0 = self.get_lane___m128i_i8(a, (i / 8u64)); - let __tmp1 = self.abs_i8(__tmp0); + let __tmp0 = self.get_lane___m128i_u8(a, (i / 8u64)); + let __tmp1 = self.abs_u8(__tmp0); self.set_lane___m128i_u8(dst, (i / 8u64), __tmp1); } } fn _mm_abs_epi16(&mut self, dst: &mut Self::__m128i, a: Self::__m128i) { for j in 0u64..=7u64 { let i = (j * 16u64); - let __tmp0 = self.get_lane___m128i_i16(a, (i / 16u64)); - let __tmp1 = self.abs_i16(__tmp0); + let __tmp0 = self.get_lane___m128i_u16(a, (i / 16u64)); + let __tmp1 = self.abs_u16(__tmp0); self.set_lane___m128i_u16(dst, (i / 16u64), __tmp1); } } fn _mm_abs_epi32(&mut self, dst: &mut Self::__m128i, a: Self::__m128i) { for j in 0u64..=3u64 { let i = (j * 32u64); - let __tmp0 = self.get_lane___m128i_i32(a, (i / 32u64)); - let __tmp1 = self.abs_i32(__tmp0); + let __tmp0 = self.get_lane___m128i_u32(a, (i / 32u64)); + let __tmp1 = self.abs_u32(__tmp0); self.set_lane___m128i_u32(dst, (i / 32u64), __tmp1); } } @@ -371,34 +276,15 @@ pub trait Intrinsics: super::Core { pub mod soft_arch { pub use super::super::soft_arch_types::*; use super::Intrinsics; - pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_add_epi16(&mut output, a, b); - output - } - pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_add_epi32(&mut output, a, b); - output - } - pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_add_epi64(&mut output, a, b); - output - } - pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_adds_epi16(&mut output, a, b); - output - } pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_set_epi64x(&mut output, e1, e0); + super::super::ValueCore._mm_set_epi64x(&mut output, e1 as _, e0 as _); output } pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_setr_epi32(&mut output, e3, e2, e1, e0); + super::super::ValueCore + ._mm_setr_epi32(&mut output, e3 as _, e2 as _, e1 as _, e0 as _); output } pub fn _mm_setr_epi16( @@ -413,7 +299,17 @@ pub mod soft_arch { ) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; super::super::ValueCore - ._mm_setr_epi16(&mut output, e7, e6, e5, e4, e3, e2, e1, e0); + ._mm_setr_epi16( + &mut output, + e7 as _, + e6 as _, + e5 as _, + e4 as _, + e3 as _, + e2 as _, + e1 as _, + e0 as _, + ); output } pub fn _mm_setr_epi8( @@ -438,58 +334,58 @@ pub mod soft_arch { super::super::ValueCore ._mm_setr_epi8( &mut output, - e15, - e14, - e13, - e12, - e11, - e10, - e9, - e8, - e7, - e6, - e5, - e4, - e3, - e2, - e1, - e0, + e15 as _, + e14 as _, + e13 as _, + e12 as _, + e11 as _, + e10 as _, + e9 as _, + e8 as _, + e7 as _, + e6 as _, + e5 as _, + e4 as _, + e3 as _, + e2 as _, + e1 as _, + e0 as _, ); output } pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_packs_epi16(&mut output, a, b); + super::super::ValueCore._mm_packs_epi16(&mut output, a as _, b as _); output } pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_packs_epi32(&mut output, a, b); + super::super::ValueCore._mm_packs_epi32(&mut output, a as _, b as _); output } pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_packus_epi16(&mut output, a, b); + super::super::ValueCore._mm_packus_epi16(&mut output, a as _, b as _); output } pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_packus_epi32(&mut output, a, b); + super::super::ValueCore._mm_packus_epi32(&mut output, a as _, b as _); output } pub fn _mm_abs_epi8(a: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_abs_epi8(&mut output, a); + super::super::ValueCore._mm_abs_epi8(&mut output, a as _); output } pub fn _mm_abs_epi16(a: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_abs_epi16(&mut output, a); + super::super::ValueCore._mm_abs_epi16(&mut output, a as _); output } pub fn _mm_abs_epi32(a: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; - super::super::ValueCore._mm_abs_epi32(&mut output, a); + super::super::ValueCore._mm_abs_epi32(&mut output, a as _); output } } @@ -497,128 +393,92 @@ pub mod soft_arch { pub mod tests { use super::super::compare_test_helper::hard_soft_same_128; #[test] - fn _mm_add_epi16() { - hard_soft_same_128! { - { let a = _mm_setr_epi16(- 24391i16, 19541i16, - 16509i16, 7733i16, - - 15140i16, 30719i16, 16513i16, 22878i16); let b = _mm_setr_epi16(23986i16, - 27900i16, - 8343i16, - 10648i16, 4841i16, 14610i16, - 17251i16, - 3971i16); - _mm_add_epi16(a, b) } - } - } - #[test] - fn _mm_add_epi32() { - hard_soft_same_128! { - { let a = _mm_setr_epi16(22390i16, - 23547i16, 15401i16, 15832i16, - - 14212i16, - 1286i16, - 18062i16, 22296i16); let b = _mm_setr_epi16(18077i16, - 23617i16, - 9205i16, 21233i16, - 4332i16, - 31339i16, 23623i16, - 22080i16); - _mm_add_epi32(a, b) } - } - } - #[test] - fn _mm_add_epi64() { - hard_soft_same_128! { - { let a = _mm_setr_epi16(- 1436i16, - 30227i16, 8629i16, 10922i16, - - 16731i16, - 1013i16, - 14310i16, 2892i16); let b = _mm_setr_epi16(- 28568i16, - 12614i16, 20103i16, 32412i16, - 28704i16, - 27930i16, 4197i16, 1829i16); - _mm_add_epi64(a, b) } - } - } - #[test] - fn _mm_adds_epi16() { - hard_soft_same_128! { - { let a = _mm_setr_epi16(9149i16, 18759i16, 30885i16, - 3879i16, 21600i16, - 24454i16, 23524i16, 10765i16); let b = _mm_setr_epi16(32539i16, 26890i16, - - 3892i16, 4386i16, 18704i16, 8253i16, - 29217i16, 32013i16); _mm_adds_epi16(a, - b) } - } - } - #[test] fn _mm_set_epi64x() { hard_soft_same_128! { - { let e1 = - 589376611403916251i64; let e0 = 3902096933100612535i64; + { let e1 = 1041352657357235268i64; let e0 = 1955209120357942897i64; _mm_set_epi64x(e1, e0) } } } #[test] fn _mm_setr_epi32() { hard_soft_same_128! { - { let e3 = 1973077588i32; let e2 = 650443732i32; let e1 = - 2133091191i32; - let e0 = - 352824609i32; _mm_setr_epi32(e3, e2, e1, e0) } + { let e3 = 1455669123i32; let e2 = 247864885i32; let e1 = 1390920924i32; let + e0 = 1068333055i32; _mm_setr_epi32(e3, e2, e1, e0) } } } #[test] fn _mm_setr_epi16() { hard_soft_same_128! { - { let e7 = - 31392i16; let e6 = - 14015i16; let e5 = - 32565i16; let e4 = - - 11312i16; let e3 = - 4934i16; let e2 = - 19283i16; let e1 = - 27533i16; let - e0 = - 9939i16; _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) } + { let e7 = 16513i16; let e6 = 22878i16; let e5 = 23986i16; let e4 = 27900i16; + let e3 = - 8343i16; let e2 = - 10648i16; let e1 = 4841i16; let e0 = 14610i16; + _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) } } } #[test] fn _mm_setr_epi8() { hard_soft_same_128! { - { let e15 = - 46i8; let e14 = - 46i8; let e13 = - 125i8; let e12 = 81i8; let - e11 = - 56i8; let e10 = - 75i8; let e9 = 54i8; let e8 = 109i8; let e7 = 29i8; - let e6 = 41i8; let e5 = - 21i8; let e4 = 39i8; let e3 = 89i8; let e2 = - - 36i8; let e1 = - 88i8; let e0 = 11i8; _mm_setr_epi8(e15, e14, e13, e12, e11, - e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) } + { let e15 = - 99i8; let e14 = 125i8; let e13 = 118i8; let e12 = 5i8; let e11 + = 41i8; let e10 = - 40i8; let e9 = 124i8; let e8 = - 6i8; let e7 = 114i8; let + e6 = 24i8; let e5 = - 99i8; let e4 = 65i8; let e3 = 11i8; let e2 = - 15i8; + let e1 = 20i8; let e0 = - 107i8; _mm_setr_epi8(e15, e14, e13, e12, e11, e10, + e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) } } } #[test] fn _mm_packs_epi16() { hard_soft_same_128! { - { let a = _mm_setr_epi16(6572i16, - 54i16, 10431i16, - 4614i16, - 1911i16, - 17046i16, - 12772i16, - 28109i16); let b = _mm_setr_epi16(7409i16, - - 30136i16, - 28607i16, - 1975i16, 23451i16, - 32657i16, - 28920i16, - - 2519i16); _mm_packs_epi16(a, b) } + { let a = _mm_setr_epi16(23623i16, - 22080i16, - 1436i16, - 30227i16, + 8629i16, 10922i16, - 16731i16, - 1013i16); let b = _mm_setr_epi16(- 14310i16, + 2892i16, - 28568i16, 12614i16, 20103i16, 32412i16, - 28704i16, - 27930i16); + _mm_packs_epi16(a, b) } } } #[test] fn _mm_packs_epi32() { hard_soft_same_128! { - { let a = _mm_setr_epi16(- 7284i16, 7023i16, - 31688i16, 4770i16, 28846i16, - - 13549i16, 13781i16, - 10474i16); let b = _mm_setr_epi16(12050i16, - 782i16, - 8840i16, 8344i16, 9169i16, 303i16, - 6879i16, - 28778i16); _mm_packs_epi32(a, - b) } + { let a = _mm_setr_epi16(4197i16, 1829i16, 9149i16, 18759i16, 30885i16, - + 3879i16, 21600i16, 24454i16); let b = _mm_setr_epi16(23524i16, 10765i16, + 32539i16, 26890i16, - 3892i16, 4386i16, 18704i16, 8253i16); + _mm_packs_epi32(a, b) } } } #[test] fn _mm_packus_epi16() { hard_soft_same_128! { - { let a = _mm_setr_epi16(- 11301i16, 10802i16, 18689i16, 12867i16, 18892i16, - 20484i16, - 4754i16, - 28358i16); let b = _mm_setr_epi16(27422i16, - - 14791i16, - 32685i16, - 4504i16, - 19709i16, 1090i16, 1898i16, 11224i16); - _mm_packus_epi16(a, b) } + { let a = _mm_setr_epi16(- 29217i16, 32013i16, 7448i16, 2172i16, - 14764i16, + - 1068i16, - 25463i16, 21215i16); let b = _mm_setr_epi16(- 31392i16, - + 14015i16, - 32565i16, - 11312i16, - 4934i16, - 19283i16, - 27533i16, - + 9939i16); _mm_packus_epi16(a, b) } } } #[test] fn _mm_packus_epi32() { hard_soft_same_128! { - { let a = _mm_setr_epi16(27569i16, 26879i16, 11743i16, 1055i16, 5327i16, - - 1490i16, - 6436i16, 1056i16); let b = _mm_setr_epi16(- 16744i16, 28829i16, - 23772i16, - 31202i16, 9764i16, 16146i16, 29119i16, 1909i16); + { let a = _mm_setr_epi16(- 9518i16, - 29742i16, 10115i16, 1617i16, 13256i16, + - 2379i16, 19254i16, 7533i16); let b = _mm_setr_epi16(- 17891i16, 30761i16, + 2539i16, 4135i16, 26713i16, 16348i16, - 21336i16, 3595i16); _mm_packus_epi32(a, b) } } } #[test] fn _mm_abs_epi8() { hard_soft_same_128! { - { let a = _mm_setr_epi16(- 4803i16, - 23533i16, - 22862i16, - 25389i16, - - 16117i16, - 21476i16, 30010i16, - 15743i16); _mm_abs_epi8(a) } + { let a = _mm_setr_epi16(6572i16, - 54i16, 10431i16, - 4614i16, - 1911i16, + 17046i16, - 12772i16, - 28109i16); _mm_abs_epi8(a) } } } #[test] fn _mm_abs_epi16() { hard_soft_same_128! { - { let a = _mm_setr_epi16(- 20689i16, - 11653i16, 22142i16, - 16597i16, - 28514i16, - 15735i16, - 6977i16, - 5493i16); _mm_abs_epi16(a) } + { let a = _mm_setr_epi16(7409i16, - 30136i16, - 28607i16, - 1975i16, + 23451i16, - 32657i16, - 28920i16, - 2519i16); _mm_abs_epi16(a) } } } #[test] fn _mm_abs_epi32() { hard_soft_same_128! { - { let a = _mm_setr_epi16(17059i16, 15712i16, 32305i16, - 23877i16, 29411i16, - - 3868i16, - 10128i16, 25298i16); _mm_abs_epi32(a) } + { let a = _mm_setr_epi16(- 7284i16, 7023i16, - 31688i16, 4770i16, 28846i16, - + 13549i16, 13781i16, - 10474i16); _mm_abs_epi32(a) } } } } diff --git a/crates/intringen/src/x86/mod.rs b/crates/intringen/src/x86/mod.rs index c703a8f..4743ce1 100644 --- a/crates/intringen/src/x86/mod.rs +++ b/crates/intringen/src/x86/mod.rs @@ -10,41 +10,28 @@ pub trait Core { type u32: Copy; type u64: Copy; - type i8: Copy; - type i16: Copy; - type i32: Copy; - type i64: Copy; - type __m128i: Copy; - fn cast_sign_i8_u8(&mut self, value: Self::i8) -> Self::u8; - fn cast_sign_i16_u16(&mut self, value: Self::i16) -> Self::u16; - fn cast_sign_i32_u32(&mut self, value: Self::i32) -> Self::u32; - fn cast_sign_i64_u64(&mut self, value: Self::i64) -> Self::u64; - fn get_lane___m128i_u8(&mut self, value: Self::__m128i, idx: u64) -> Self::u8; - fn get_lane___m128i_i8(&mut self, value: Self::__m128i, idx: u64) -> Self::i8; fn get_lane___m128i_u16(&mut self, value: Self::__m128i, idx: u64) -> Self::u16; - fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16; fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32; - fn get_lane___m128i_i32(&mut self, value: Self::__m128i, idx: u64) -> Self::i32; fn get_lane___m128i_u64(&mut self, value: Self::__m128i, idx: u64) -> Self::u64; - fn get_lane___m128i_i64(&mut self, value: Self::__m128i, idx: u64) -> Self::i64; fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8); - fn set_lane___m128i_i8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i8); fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16); - fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16); fn set_lane___m128i_u32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u32); - fn set_lane___m128i_i32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i32); fn set_lane___m128i_u64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u64); - fn set_lane___m128i_i64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i64); - fn saturate8(&mut self, elem: Self::i16) -> Self::i8; - fn saturate_u8(&mut self, elem: Self::i16) -> Self::u8; - fn saturate16(&mut self, elem: Self::i32) -> Self::i16; - fn saturate_u16(&mut self, elem: Self::i32) -> Self::u16; - fn add_u64(&mut self, lhs: Self::u64, rhs: Self::u64) -> Self::u64; + fn saturate8(&mut self, elem: Self::u16) -> Self::u8; + fn saturate_u8(&mut self, elem: Self::u16) -> Self::u8; + fn saturate16(&mut self, elem: Self::u32) -> Self::u16; + fn saturate_u16(&mut self, elem: Self::u32) -> Self::u16; + fn add_64(&mut self, lhs: Self::u64, rhs: Self::u64) -> Self::u64; + + fn abs_u8(&mut self, x: Self::u8) -> Self::u8; + fn abs_u16(&mut self, x: Self::u16) -> Self::u16; + fn abs_u32(&mut self, x: Self::u32) -> Self::u32; + fn abs_u64(&mut self, x: Self::u64) -> Self::u64; } pub struct ValueCore; @@ -55,40 +42,13 @@ impl Core for ValueCore { type u32 = u32; type u64 = u64; - type i8 = i8; - type i16 = i16; - type i32 = i32; - type i64 = i64; - type __m128i = [u8; 16]; - ////// CAST - - fn cast_sign_i8_u8(&mut self, value: Self::i8) -> Self::u8 { - value as _ - } - - fn cast_sign_i16_u16(&mut self, value: Self::i16) -> Self::u16 { - value as _ - } - - fn cast_sign_i32_u32(&mut self, value: Self::i32) -> Self::u32 { - value as _ - } - - fn cast_sign_i64_u64(&mut self, value: Self::i64) -> Self::u64 { - value as _ - } - ////// GET LANE fn get_lane___m128i_u8(&mut self, value: Self::__m128i, idx: u64) -> Self::u8 { value[idx as usize] } - fn get_lane___m128i_i8(&mut self, value: Self::__m128i, idx: u64) -> Self::i8 { - self.get_lane___m128i_u8(value, idx) as i8 - } - fn get_lane___m128i_u16(&mut self, value: Self::__m128i, idx: u64) -> Self::u16 { let mut acc = 0; for i in 0..2 { @@ -99,10 +59,6 @@ impl Core for ValueCore { acc } - fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16 { - self.get_lane___m128i_u16(value, idx) as i16 - } - fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32 { let mut acc = 0; for i in 0..4 { @@ -113,10 +69,6 @@ impl Core for ValueCore { acc } - fn get_lane___m128i_i32(&mut self, value: Self::__m128i, idx: u64) -> Self::i32 { - self.get_lane___m128i_u32(value, idx) as i32 - } - fn get_lane___m128i_u64(&mut self, value: Self::__m128i, idx: u64) -> Self::u64 { let mut acc = 0; for i in 0..8 { @@ -127,20 +79,12 @@ impl Core for ValueCore { acc } - fn get_lane___m128i_i64(&mut self, value: Self::__m128i, idx: u64) -> Self::i64 { - self.get_lane___m128i_u64(value, idx) as i64 - } - ////// SET LANE fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8) { place[idx as usize] = value; } - fn set_lane___m128i_i8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i8) { - self.set_lane___m128i_u8(place, idx, value as u8); - } - fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16) { for i in 0..2 { let value = ((value >> 8 * i) & 0xFF) as u8; @@ -148,10 +92,6 @@ impl Core for ValueCore { } } - fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16) { - self.set_lane___m128i_u16(place, idx, value as u16); - } - fn set_lane___m128i_u32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u32) { for i in 0..4 { let value = ((value >> 8 * i) & 0xFF) as u8; @@ -159,10 +99,6 @@ impl Core for ValueCore { } } - fn set_lane___m128i_i32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i32) { - self.set_lane___m128i_u32(place, idx, value as u32); - } - fn set_lane___m128i_u64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u64) { for i in 0..8 { let value = ((value >> 8 * i) & 0xFF) as u8; @@ -170,33 +106,45 @@ impl Core for ValueCore { } } - fn set_lane___m128i_i64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i64) { - self.set_lane___m128i_u32(place, idx, value as u32); - } - ////// HELPERS - fn saturate8(&mut self, elem: Self::i16) -> Self::i8 { - let clamp = elem.clamp(i8::MIN as i16, i8::MAX as i16); - clamp as i8 + fn saturate8(&mut self, elem: Self::u16) -> Self::u8 { + let clamp = (elem as i16).clamp(i8::MIN as i16, i8::MAX as i16); + clamp as i8 as u8 } - fn saturate_u8(&mut self, elem: Self::i16) -> Self::u8 { - let clamp = elem.clamp(0, u8::MAX as i16); + fn saturate_u8(&mut self, elem: Self::u16) -> Self::u8 { + let clamp = (elem as i16).clamp(0, u8::MAX as i16); clamp as u8 } - fn saturate16(&mut self, elem: Self::i32) -> Self::i16 { - let clamp = elem.clamp(i16::MIN as i32, i16::MAX as i32); - clamp as i16 + fn saturate16(&mut self, elem: Self::u32) -> Self::u16 { + let clamp = (elem as i32).clamp(i16::MIN as i32, i16::MAX as i32); + clamp as i16 as u16 } - fn saturate_u16(&mut self, elem: Self::i32) -> Self::u16 { - let clamp = elem.clamp(0, u16::MAX as i32); + fn saturate_u16(&mut self, elem: Self::u32) -> Self::u16 { + let clamp = (elem as i32).clamp(0, u16::MAX as i32); clamp as u16 } - fn add_u64(&mut self, lhs: Self::u64, rhs: Self::u64) -> Self::u64 { + fn add_64(&mut self, lhs: Self::u64, rhs: Self::u64) -> Self::u64 { lhs.wrapping_add(rhs) } + + fn abs_u8(&mut self, x: Self::u8) -> Self::u8 { + (x as i8).abs() as u8 + } + + fn abs_u16(&mut self, x: Self::u16) -> Self::u16 { + (x as i16).abs() as u16 + } + + fn abs_u32(&mut self, x: Self::u32) -> Self::u32 { + (x as i32).abs() as u32 + } + + fn abs_u64(&mut self, x: Self::u64) -> Self::u64 { + (x as i64).abs() as u64 + } } mod soft_arch_types {