diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index d5b3ab7..a9c0e97 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ -154,21 +154,24 @@ fn generate_body_test(intr: &Intrinsic, rng: &mut SmallRng) -> Result Result { - let quotei16 = |n| { + fn quote(n: impl quote::ToTokens) -> syn::Expr { syn::parse_quote! { #n } - }; + } Ok(match ty { - "i16" => quotei16(rng.gen::()), + "i8" => quote(rng.gen::()), + "i16" => quote(rng.gen::()), + "i32" => quote(rng.gen::()), + "i64" => quote(rng.gen::()), "__m128i" => { let args = [ - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), - quotei16(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), + quote(rng.gen::()), ]; syn::parse_quote! { @@ -192,7 +195,10 @@ impl VariableType { fn of(etype: &str, ty: &str) -> Result { let (rawtype_signed, full_width) = match map_type_to_rust(ty) { "__m128i" => (false, 128), + "i8" => (true, 8), "i16" => (true, 16), + "i32" => (true, 32), + "i64" => (true, 64), _ => bail!("unknown type: {ty}"), }; let (is_signed, elem_width) = match etype { @@ -201,6 +207,8 @@ impl VariableType { "SI32" => (true, 32), "UI8" => (false, 8), "UI16" => (false, 16), + "UI32" => (false, 32), + "UI64" => (false, 64), _ => bail!("unknown element type: {etype}"), }; Ok(Self { @@ -432,7 +440,11 @@ fn signature_soft_arch(intr: &Intrinsic, body: syn::Block) -> Result &str { match ty { + "__m128i" => ty, + "char" => "i8", "short" => "i16", - ty => ty, + "int" => "i32", + "__int64" => "i64", + ty => panic!("unknown type: {ty}"), } } diff --git a/crates/generate/src/main.rs b/crates/generate/src/main.rs index 0d4c013..ffe78be 100644 --- a/crates/generate/src/main.rs +++ b/crates/generate/src/main.rs @@ -89,9 +89,13 @@ fn 
main() -> Result<()> { } const INTRINSICS_GENERATE: &[&str] = &[ + "_mm_setr_epi8", + "_mm_setr_epi16", + "_mm_setr_epi32", + "_mm_set_epi64x", + // packing instructions "_mm_packus_epi16", "_mm_packs_epi16", "_mm_packus_epi32", "_mm_packs_epi32", - "_mm_setr_epi16", ]; diff --git a/crates/intringen/src/x86/generated.rs b/crates/intringen/src/x86/generated.rs index 3ea2658..89138e4 100644 --- a/crates/intringen/src/x86/generated.rs +++ b/crates/intringen/src/x86/generated.rs @@ -1,5 +1,28 @@ impl Intrinsics for C {} pub trait Intrinsics: super::Core { + fn _mm_set_epi64x(&mut self, dst: &mut Self::__m128i, e1: Self::i64, e0: Self::i64) { + let __tmp = self.cast_sign_i64_u64(e0); + self.set_lane___m128i_u64(dst, 0u64, __tmp); + let __tmp = self.cast_sign_i64_u64(e1); + self.set_lane___m128i_u64(dst, 1u64, __tmp); + } + fn _mm_setr_epi32( + &mut self, + dst: &mut Self::__m128i, + e3: Self::i32, + e2: Self::i32, + e1: Self::i32, + e0: Self::i32, + ) { + let __tmp = self.cast_sign_i32_u32(e3); + self.set_lane___m128i_u32(dst, 0u64, __tmp); + let __tmp = self.cast_sign_i32_u32(e2); + self.set_lane___m128i_u32(dst, 1u64, __tmp); + let __tmp = self.cast_sign_i32_u32(e1); + self.set_lane___m128i_u32(dst, 2u64, __tmp); + let __tmp = self.cast_sign_i32_u32(e0); + self.set_lane___m128i_u32(dst, 3u64, __tmp); + } fn _mm_setr_epi16( &mut self, dst: &mut Self::__m128i, @@ -29,6 +52,59 @@ pub trait Intrinsics: super::Core { let __tmp = self.cast_sign_i16_u16(e0); self.set_lane___m128i_u16(dst, 7u64, __tmp); } + fn _mm_setr_epi8( + &mut self, + dst: &mut Self::__m128i, + e15: Self::i8, + e14: Self::i8, + e13: Self::i8, + e12: Self::i8, + e11: Self::i8, + e10: Self::i8, + e9: Self::i8, + e8: Self::i8, + e7: Self::i8, + e6: Self::i8, + e5: Self::i8, + e4: Self::i8, + e3: Self::i8, + e2: Self::i8, + e1: Self::i8, + e0: Self::i8, + ) { + let __tmp = self.cast_sign_i8_u8(e15); + self.set_lane___m128i_u8(dst, 0u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e14); + 
self.set_lane___m128i_u8(dst, 1u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e13); + self.set_lane___m128i_u8(dst, 2u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e12); + self.set_lane___m128i_u8(dst, 3u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e11); + self.set_lane___m128i_u8(dst, 4u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e10); + self.set_lane___m128i_u8(dst, 5u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e9); + self.set_lane___m128i_u8(dst, 6u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e8); + self.set_lane___m128i_u8(dst, 7u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e7); + self.set_lane___m128i_u8(dst, 8u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e6); + self.set_lane___m128i_u8(dst, 9u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e5); + self.set_lane___m128i_u8(dst, 10u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e4); + self.set_lane___m128i_u8(dst, 11u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e3); + self.set_lane___m128i_u8(dst, 12u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e2); + self.set_lane___m128i_u8(dst, 13u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e1); + self.set_lane___m128i_u8(dst, 14u64, __tmp); + let __tmp = self.cast_sign_i8_u8(e0); + self.set_lane___m128i_u8(dst, 15u64, __tmp); + } fn _mm_packs_epi16( &mut self, dst: &mut Self::__m128i, @@ -205,6 +281,16 @@ pub trait Intrinsics: super::Core { pub mod soft_arch { pub use super::super::soft_arch_types::*; use super::Intrinsics; + pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + let mut output = unsafe { std::mem::zeroed() }; + super::super::ValueCore._mm_set_epi64x(&mut output, e1, e0); + output + } + pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + let mut output = unsafe { std::mem::zeroed() }; + super::super::ValueCore._mm_setr_epi32(&mut output, e3, e2, e1, e0); + output + } pub fn _mm_setr_epi16( e7: i16, e6: i16, @@ -220,6 +306,47 @@ pub mod soft_arch { ._mm_setr_epi16(&mut output, e7, e6, e5, e4, e3, e2, e1, e0); output 
} + pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, + ) -> __m128i { + let mut output = unsafe { std::mem::zeroed() }; + super::super::ValueCore + ._mm_setr_epi8( + &mut output, + e15, + e14, + e13, + e12, + e11, + e10, + e9, + e8, + e7, + e6, + e5, + e4, + e3, + e2, + e1, + e0, + ); + output + } pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; super::super::ValueCore._mm_packs_epi16(&mut output, a, b); @@ -245,46 +372,70 @@ pub mod soft_arch { pub mod tests { use super::super::compare_test_helper::hard_soft_same_128; #[test] + fn _mm_set_epi64x() { + hard_soft_same_128! { + { let e1 = 1041352657357235268i64; let e0 = 1955209120357942897i64; + _mm_set_epi64x(e1, e0) } + } + } + #[test] + fn _mm_setr_epi32() { + hard_soft_same_128! { + { let e3 = 1455669123i32; let e2 = 247864885i32; let e1 = 1390920924i32; let + e0 = 1068333055i32; _mm_setr_epi32(e3, e2, e1, e0) } + } + } + #[test] fn _mm_setr_epi16() { hard_soft_same_128! { - { let e7 = - 24391i16; let e6 = 19541i16; let e5 = - 16509i16; let e4 = - 7733i16; let e3 = - 15140i16; let e2 = 30719i16; let e1 = 16513i16; let e0 = - 22878i16; _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) } + { let e7 = 16513i16; let e6 = 22878i16; let e5 = 23986i16; let e4 = 27900i16; + let e3 = - 8343i16; let e2 = - 10648i16; let e1 = 4841i16; let e0 = 14610i16; + _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) } + } + } + #[test] + fn _mm_setr_epi8() { + hard_soft_same_128! 
{ + { let e15 = - 99i8; let e14 = 125i8; let e13 = 118i8; let e12 = 5i8; let e11 + = 41i8; let e10 = - 40i8; let e9 = 124i8; let e8 = - 6i8; let e7 = 114i8; let + e6 = 24i8; let e5 = - 99i8; let e4 = 65i8; let e3 = 11i8; let e2 = - 15i8; + let e1 = 20i8; let e0 = - 107i8; _mm_setr_epi8(e15, e14, e13, e12, e11, e10, + e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) } } } #[test] fn _mm_packs_epi16() { hard_soft_same_128! { - { let a = _mm_setr_epi16(23986i16, 27900i16, - 8343i16, - 10648i16, 4841i16, - 14610i16, - 17251i16, - 3971i16); let b = _mm_setr_epi16(22390i16, - - 23547i16, 15401i16, 15832i16, - 14212i16, - 1286i16, - 18062i16, 22296i16); + { let a = _mm_setr_epi16(23623i16, - 22080i16, - 1436i16, - 30227i16, + 8629i16, 10922i16, - 16731i16, - 1013i16); let b = _mm_setr_epi16(- 14310i16, + 2892i16, - 28568i16, 12614i16, 20103i16, 32412i16, - 28704i16, - 27930i16); _mm_packs_epi16(a, b) } } } #[test] fn _mm_packs_epi32() { hard_soft_same_128! { - { let a = _mm_setr_epi16(18077i16, 23617i16, - 9205i16, 21233i16, - 4332i16, - - 31339i16, 23623i16, - 22080i16); let b = _mm_setr_epi16(- 1436i16, - - 30227i16, 8629i16, 10922i16, - 16731i16, - 1013i16, - 14310i16, 2892i16); + { let a = _mm_setr_epi16(4197i16, 1829i16, 9149i16, 18759i16, 30885i16, - + 3879i16, 21600i16, 24454i16); let b = _mm_setr_epi16(23524i16, 10765i16, + 32539i16, 26890i16, - 3892i16, 4386i16, 18704i16, 8253i16); _mm_packs_epi32(a, b) } } } #[test] fn _mm_packus_epi16() { hard_soft_same_128! 
{ - { let a = _mm_setr_epi16(- 28568i16, 12614i16, 20103i16, 32412i16, - - 28704i16, - 27930i16, 4197i16, 1829i16); let b = _mm_setr_epi16(9149i16, - 18759i16, 30885i16, - 3879i16, 21600i16, 24454i16, 23524i16, 10765i16); - _mm_packus_epi16(a, b) } + { let a = _mm_setr_epi16(- 29217i16, 32013i16, 7448i16, 2172i16, - 14764i16, + - 1068i16, - 25463i16, 21215i16); let b = _mm_setr_epi16(- 31392i16, - + 14015i16, - 32565i16, - 11312i16, - 4934i16, - 19283i16, - 27533i16, - + 9939i16); _mm_packus_epi16(a, b) } } } #[test] fn _mm_packus_epi32() { hard_soft_same_128! { - { let a = _mm_setr_epi16(32539i16, 26890i16, - 3892i16, 4386i16, 18704i16, - 8253i16, - 29217i16, 32013i16); let b = _mm_setr_epi16(7448i16, 2172i16, - - 14764i16, - 1068i16, - 25463i16, 21215i16, - 31392i16, - 14015i16); + { let a = _mm_setr_epi16(- 9518i16, - 29742i16, 10115i16, 1617i16, 13256i16, + - 2379i16, 19254i16, 7533i16); let b = _mm_setr_epi16(- 17891i16, 30761i16, + 2539i16, 4135i16, 26713i16, 16348i16, - 21336i16, 3595i16); _mm_packus_epi32(a, b) } } } diff --git a/crates/intringen/src/x86/mod.rs b/crates/intringen/src/x86/mod.rs index a8e28b8..ff95545 100644 --- a/crates/intringen/src/x86/mod.rs +++ b/crates/intringen/src/x86/mod.rs @@ -17,17 +17,28 @@ pub trait Core { type __m128i: Copy; + fn cast_sign_i8_u8(&mut self, value: Self::i8) -> Self::u8; fn cast_sign_i16_u16(&mut self, value: Self::i16) -> Self::u16; + fn cast_sign_i32_u32(&mut self, value: Self::i32) -> Self::u32; + fn cast_sign_i64_u64(&mut self, value: Self::i64) -> Self::u64; + fn get_lane___m128i_u8(&mut self, value: Self::__m128i, idx: u64) -> Self::u8; + fn get_lane___m128i_i8(&mut self, value: Self::__m128i, idx: u64) -> Self::i8; fn get_lane___m128i_u16(&mut self, value: Self::__m128i, idx: u64) -> Self::u16; fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16; fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32; fn get_lane___m128i_i32(&mut self, value: 
Self::__m128i, idx: u64) -> Self::i32; + fn get_lane___m128i_u64(&mut self, value: Self::__m128i, idx: u64) -> Self::u64; + fn get_lane___m128i_i64(&mut self, value: Self::__m128i, idx: u64) -> Self::i64; fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8); fn set_lane___m128i_i8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i8); fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16); fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16); + fn set_lane___m128i_u32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u32); + fn set_lane___m128i_i32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i32); + fn set_lane___m128i_u64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u64); + fn set_lane___m128i_i64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i64); fn saturate8(&mut self, elem: Self::i16) -> Self::i8; fn saturate_u8(&mut self, elem: Self::i16) -> Self::u8; @@ -50,15 +61,42 @@ impl Core for ValueCore { type __m128i = [u8; 16]; + ////// CAST + + fn cast_sign_i8_u8(&mut self, value: Self::i8) -> Self::u8 { + value as _ + } + fn cast_sign_i16_u16(&mut self, value: Self::i16) -> Self::u16 { value as _ } - fn get_lane___m128i_u16(&mut self, value: Self::__m128i, idx: u64) -> Self::u16 { - let first = value[(idx * 2 + 1) as usize]; - let second = value[(idx * 2) as usize]; + fn cast_sign_i32_u32(&mut self, value: Self::i32) -> Self::u32 { + value as _ + } - ((first as u16) << 8) | (second as u16) + fn cast_sign_i64_u64(&mut self, value: Self::i64) -> Self::u64 { + value as _ + } + + ////// GET LANE + + fn get_lane___m128i_u8(&mut self, value: Self::__m128i, idx: u64) -> Self::u8 { + value[idx as usize] + } + + fn get_lane___m128i_i8(&mut self, value: Self::__m128i, idx: u64) -> Self::i8 { + self.get_lane___m128i_u8(value, idx) as i8 + } + + fn get_lane___m128i_u16(&mut self, value: Self::__m128i, 
idx: u64) -> Self::u16 { + let mut acc = 0; + for i in 0..2 { + let v = value[(idx * 2 + i) as usize]; + acc |= (v as u16) << (8 * i); + } + + acc } fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16 { @@ -66,18 +104,35 @@ impl Core for ValueCore { } fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32 { - let first = value[(idx * 4 + 3) as usize]; - let second = value[(idx * 4 + 2) as usize]; - let third = value[(idx * 4 + 1) as usize]; - let fourth = value[(idx * 4) as usize]; + let mut acc = 0; + for i in 0..4 { + let v = value[(idx * 4 + i) as usize]; + acc |= (v as u32) << (8 * i); + } - ((first as u32) << 24) | ((second as u32) << 16) | ((third as u32) << 8) | (fourth as u32) + acc } fn get_lane___m128i_i32(&mut self, value: Self::__m128i, idx: u64) -> Self::i32 { self.get_lane___m128i_u32(value, idx) as i32 } + fn get_lane___m128i_u64(&mut self, value: Self::__m128i, idx: u64) -> Self::u64 { + let mut acc = 0; + for i in 0..8 { + let v = value[(idx * 8 + i) as usize]; + acc |= (v as u64) << (8 * i); + } + + acc + } + + fn get_lane___m128i_i64(&mut self, value: Self::__m128i, idx: u64) -> Self::i64 { + self.get_lane___m128i_u64(value, idx) as i64 + } + + ////// SET LANE + fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8) { place[idx as usize] = value; } @@ -87,16 +142,40 @@ impl Core for ValueCore { } fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16) { - let first = (value & 0xFF) as u8; - let second = (value >> 8) as u8; - place[(idx * 2) as usize] = first; - place[(idx * 2 + 1) as usize] = second; + for i in 0..2 { + let value = ((value >> 8 * i) & 0xFF) as u8; + place[(idx * 2 + i) as usize] = value; + } } fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16) { self.set_lane___m128i_u16(place, idx, value as u16); } + fn set_lane___m128i_u32(&mut self, place: &mut Self::__m128i, idx: u64, 
value: Self::u32) {
+        for i in 0..4 {
+            let value = ((value >> 8 * i) & 0xFF) as u8;
+            place[(idx * 4 + i) as usize] = value;
+        }
+    }
+
+    fn set_lane___m128i_i32(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i32) {
+        self.set_lane___m128i_u32(place, idx, value as u32);
+    }
+
+    fn set_lane___m128i_u64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u64) {
+        for i in 0..8 {
+            let value = ((value >> 8 * i) & 0xFF) as u8;
+            place[(idx * 8 + i) as usize] = value;
+        }
+    }
+
+    fn set_lane___m128i_i64(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i64) {
+        self.set_lane___m128i_u64(place, idx, value as u64); // same-width reinterpret; the u64 writer stores all 8 bytes at idx * 8
+    }
+
+    ////// HELPERS
+
     fn saturate8(&mut self, elem: Self::i16) -> Self::i8 {
         let clamp = elem.clamp(i8::MIN as i16, i8::MAX as i16);
         clamp as i8