From 9e4ce7913ac4b0bba2f5fd0ac8b9db953d860888 Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Sat, 6 Jan 2024 00:03:23 +0100 Subject: [PATCH] more pack --- crates/generate/src/generate.rs | 1 + crates/generate/src/main.rs | 8 ++- crates/intringen/src/x86/generated.rs | 92 ++++++++++++++++++++++++++- crates/intringen/src/x86/mod.rs | 36 +++++++++-- 4 files changed, 131 insertions(+), 6 deletions(-) diff --git a/crates/generate/src/generate.rs b/crates/generate/src/generate.rs index b9cced3..d5b3ab7 100644 --- a/crates/generate/src/generate.rs +++ b/crates/generate/src/generate.rs @@ -198,6 +198,7 @@ impl VariableType { let (is_signed, elem_width) = match etype { "SI8" => (true, 8), "SI16" => (true, 16), + "SI32" => (true, 32), "UI8" => (false, 8), "UI16" => (false, 16), _ => bail!("unknown element type: {etype}"), diff --git a/crates/generate/src/main.rs b/crates/generate/src/main.rs index af82a41..0d4c013 100644 --- a/crates/generate/src/main.rs +++ b/crates/generate/src/main.rs @@ -88,4 +88,10 @@ fn main() -> Result<()> { Ok(()) } -const INTRINSICS_GENERATE: &[&str] = &["_mm_packus_epi16", "_mm_packs_epi16", "_mm_setr_epi16"]; +const INTRINSICS_GENERATE: &[&str] = &[ + "_mm_packus_epi16", + "_mm_packs_epi16", + "_mm_packus_epi32", + "_mm_packs_epi32", + "_mm_setr_epi16", +]; diff --git a/crates/intringen/src/x86/generated.rs b/crates/intringen/src/x86/generated.rs index 7a4602e..3ea2658 100644 --- a/crates/intringen/src/x86/generated.rs +++ b/crates/intringen/src/x86/generated.rs @@ -84,6 +84,37 @@ pub trait Intrinsics: super::Core { let __tmp = self.saturate8(__tmp); self.set_lane___m128i_i8(dst, 15u64, __tmp); } + fn _mm_packs_epi32( + &mut self, + dst: &mut Self::__m128i, + a: Self::__m128i, + b: Self::__m128i, + ) { + let __tmp = self.get_lane___m128i_i32(a, 0u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 0u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 1u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 1u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 2u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 2u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 3u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 3u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 0u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 4u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 1u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 5u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 2u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 6u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 3u64); + let __tmp = self.saturate16(__tmp); + self.set_lane___m128i_i16(dst, 7u64, __tmp); + } fn _mm_packus_epi16( &mut self, dst: &mut Self::__m128i, @@ -139,6 +170,37 @@ pub trait Intrinsics: super::Core { let __tmp = self.saturate_u8(__tmp); self.set_lane___m128i_u8(dst, 15u64, __tmp); } + fn _mm_packus_epi32( + &mut self, + dst: &mut Self::__m128i, + a: Self::__m128i, + b: Self::__m128i, + ) { + let __tmp = self.get_lane___m128i_i32(a, 0u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 0u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 1u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 1u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 2u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 2u64, __tmp); + let __tmp = self.get_lane___m128i_i32(a, 3u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 3u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 0u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 4u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 1u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 5u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 2u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 6u64, __tmp); + let __tmp = self.get_lane___m128i_i32(b, 3u64); + let __tmp = self.saturate_u16(__tmp); + self.set_lane___m128i_u16(dst, 7u64, __tmp); + } } pub mod soft_arch { pub use super::super::soft_arch_types::*; @@ -163,11 +225,21 @@ pub mod soft_arch { super::super::ValueCore._mm_packs_epi16(&mut output, a, b); output } + pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { + let mut output = unsafe { std::mem::zeroed() }; + super::super::ValueCore._mm_packs_epi32(&mut output, a, b); + output + } pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { let mut output = unsafe { std::mem::zeroed() }; super::super::ValueCore._mm_packus_epi16(&mut output, a, b); output } + pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { + let mut output = unsafe { std::mem::zeroed() }; + super::super::ValueCore._mm_packus_epi32(&mut output, a, b); + output + } } #[cfg(all(test, target_arch = "x86_64"))] pub mod tests { @@ -190,13 +262,31 @@ pub mod tests { } } #[test] - fn _mm_packus_epi16() { + fn _mm_packs_epi32() { hard_soft_same_128! { { let a = _mm_setr_epi16(18077i16, 23617i16, - 9205i16, 21233i16, - 4332i16, - 31339i16, 23623i16, - 22080i16); let b = _mm_setr_epi16(- 1436i16, - 30227i16, 8629i16, 10922i16, - 16731i16, - 1013i16, - 14310i16, 2892i16); + _mm_packs_epi32(a, b) } + } + } + #[test] + fn _mm_packus_epi16() { + hard_soft_same_128! { + { let a = _mm_setr_epi16(- 28568i16, 12614i16, 20103i16, 32412i16, - + 28704i16, - 27930i16, 4197i16, 1829i16); let b = _mm_setr_epi16(9149i16, + 18759i16, 30885i16, - 3879i16, 21600i16, 24454i16, 23524i16, 10765i16); _mm_packus_epi16(a, b) } } } + #[test] + fn _mm_packus_epi32() { + hard_soft_same_128! { + { let a = _mm_setr_epi16(32539i16, 26890i16, - 3892i16, 4386i16, 18704i16, + 8253i16, - 29217i16, 32013i16); let b = _mm_setr_epi16(7448i16, 2172i16, - + 14764i16, - 1068i16, - 25463i16, 21215i16, - 31392i16, - 14015i16); + _mm_packus_epi32(a, b) } + } + } } diff --git a/crates/intringen/src/x86/mod.rs b/crates/intringen/src/x86/mod.rs index 16aaf5d..a8e28b8 100644 --- a/crates/intringen/src/x86/mod.rs +++ b/crates/intringen/src/x86/mod.rs @@ -21,13 +21,18 @@ pub trait Core { fn get_lane___m128i_u16(&mut self, value: Self::__m128i, idx: u64) -> Self::u16; fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16; + fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32; + fn get_lane___m128i_i32(&mut self, value: Self::__m128i, idx: u64) -> Self::i32; fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8); fn set_lane___m128i_i8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i8); fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16); + fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16); fn saturate8(&mut self, elem: Self::i16) -> Self::i8; fn saturate_u8(&mut self, elem: Self::i16) -> Self::u8; + fn saturate16(&mut self, elem: Self::i32) -> Self::i16; + fn saturate_u16(&mut self, elem: Self::i32) -> Self::u16; } pub struct ValueCore; @@ -57,10 +62,20 @@ impl Core for ValueCore { } fn get_lane___m128i_i16(&mut self, value: Self::__m128i, idx: u64) -> Self::i16 { - let first = value[(idx * 2 + 1) as usize]; - let second = value[(idx * 2) as usize]; + self.get_lane___m128i_u16(value, idx) as i16 + } - ((((first as u16) << 8) as u16) | (second as u16)) as i16 + fn get_lane___m128i_u32(&mut self, value: Self::__m128i, idx: u64) -> Self::u32 { + let first = value[(idx * 4 + 3) as usize]; + let second = value[(idx * 4 + 2) as usize]; + let third = value[(idx * 4 + 1) as usize]; + let fourth = value[(idx * 4) as usize]; + + ((first as u32) << 24) | ((second as u32) << 16) | ((third as u32) << 8) | (fourth as u32) + } + + fn get_lane___m128i_i32(&mut self, value: Self::__m128i, idx: u64) -> Self::i32 { + self.get_lane___m128i_u32(value, idx) as i32 } fn set_lane___m128i_u8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u8) { @@ -68,7 +83,7 @@ impl Core for ValueCore { } fn set_lane___m128i_i8(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i8) { - place[idx as usize] = value as u8; + self.set_lane___m128i_u8(place, idx, value as u8); } fn set_lane___m128i_u16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::u16) { @@ -78,6 +93,10 @@ impl Core for ValueCore { place[(idx * 2 + 1) as usize] = second; } + fn set_lane___m128i_i16(&mut self, place: &mut Self::__m128i, idx: u64, value: Self::i16) { + self.set_lane___m128i_u16(place, idx, value as u16); + } + fn saturate8(&mut self, elem: Self::i16) -> Self::i8 { let clamp = elem.clamp(i8::MIN as i16, i8::MAX as i16); clamp as i8 @@ -87,6 +106,15 @@ impl Core for ValueCore { let clamp = elem.clamp(0, u8::MAX as i16); clamp as u8 } + + fn saturate16(&mut self, elem: Self::i32) -> Self::i16 { + let clamp = elem.clamp(i16::MIN as i32, i16::MAX as i32); + clamp as i16 + } + fn saturate_u16(&mut self, elem: Self::i32) -> Self::u16 { + let clamp = elem.clamp(0, u16::MAX as i32); + clamp as u16 + } } mod soft_arch_types {