Documentation
Overview
THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!
THEY DO NOT CONTAIN WORKING INTRINSICS!
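As a quick orientation to the naming scheme in the index below, here is a minimal sketch of how these entries read as Go calls. The import paths are hypothetical placeholders (the real module path is not shown on this page), and, per the warning above, the intrinsics do not actually execute; the sketch only illustrates the plain, Cmp*Mask, Mask* (merge-masking), and Maskz* (zero-masking) signature conventions used throughout the index.

    // Illustrative sketch only. The import paths are hypothetical
    // placeholders, and (per the warning above) these demonstration
    // packages do not contain working intrinsics.
    package main

    import (
        "example.com/intrinsics/avx512f" // hypothetical path for this package
        "example.com/intrinsics/x86"     // hypothetical path for the vector/mask types
    )

    func main() {
        var a, b x86.M128i

        // Plain form: operates on every lane of the vector.
        _ = avx512f.AbsEpi64(a)

        // Cmp* forms return an x86.Mmask8 bitmask, one bit per lane.
        k := avx512f.CmpeqEpi32Mask(a, b)

        // Mask* forms copy src for lanes whose mask bit is clear;
        // Maskz* forms zero those lanes instead.
        var src, c, d x86.M256i
        _ = avx512f.M256MaskAddEpi32(src, k, c, d)
        _ = avx512f.M256MaskzAddEpi32(k, c, d)
    }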
Index
- func AbsEpi64(a x86.M128i) (dst x86.M128i)
- func AddRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func AddRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func CmpEpi32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpEpi64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpEpu32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpEpu64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpPdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)
- func CmpPsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)
- func CmpRoundSdMask(a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)
- func CmpRoundSsMask(a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)
- func CmpSdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)
- func CmpSsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)
- func CmpeqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpeqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpeqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpeqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func ComiRoundSd(a x86.M128d, b x86.M128d, imm8 byte, sae int) int
- func ComiRoundSs(a x86.M128, b x86.M128, imm8 byte, sae int) int
- func CvtRoundi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)
- func CvtRoundi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)
- func CvtRoundi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)
- func CvtRoundsdI32(a x86.M128d, rounding int) int
- func CvtRoundsdI64(a x86.M128d, rounding int) int64
- func CvtRoundsdSi32(a x86.M128d, rounding int) int
- func CvtRoundsdSi64(a x86.M128d, rounding int) int64
- func CvtRoundsdSs(a x86.M128, b x86.M128d, rounding int) (dst x86.M128)
- func CvtRoundsdU32(a x86.M128d, rounding int) uint32
- func CvtRoundsdU64(a x86.M128d, rounding int) uint64
- func CvtRoundsi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)
- func CvtRoundsi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)
- func CvtRoundsi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)
- func CvtRoundssI32(a x86.M128, rounding int) int
- func CvtRoundssI64(a x86.M128, rounding int) int64
- func CvtRoundssSd(a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)
- func CvtRoundssSi32(a x86.M128, rounding int) int
- func CvtRoundssSi64(a x86.M128, rounding int) int64
- func CvtRoundssU32(a x86.M128, rounding int) uint32
- func CvtRoundssU64(a x86.M128, rounding int) uint64
- func CvtRoundu32Ss(a x86.M128, b uint32, rounding int) (dst x86.M128)
- func CvtRoundu64Sd(a x86.M128d, b uint64, rounding int) (dst x86.M128d)
- func CvtRoundu64Ss(a x86.M128, b uint64, rounding int) (dst x86.M128)
- func Cvtepi32Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtepi32Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtepi64Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtepi64Epi32(a x86.M128i) (dst x86.M128i)
- func Cvtepi64Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtepu32Pd(a x86.M128i) (dst x86.M128d)
- func Cvti32Sd(a x86.M128d, b int) (dst x86.M128d)
- func Cvti32Ss(a x86.M128, b int) (dst x86.M128)
- func Cvti64Sd(a x86.M128d, b int64) (dst x86.M128d)
- func Cvti64Ss(a x86.M128, b int64) (dst x86.M128)
- func CvtpdEpu32(a x86.M128d) (dst x86.M128i)
- func CvtpsEpu32(a x86.M128) (dst x86.M128i)
- func CvtsdI32(a x86.M128d) int
- func CvtsdI64(a x86.M128d) int64
- func CvtsdU32(a x86.M128d) uint32
- func CvtsdU64(a x86.M128d) uint64
- func Cvtsepi32Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtsepi32Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtsepi64Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtsepi64Epi32(a x86.M128i) (dst x86.M128i)
- func Cvtsepi64Epi8(a x86.M128i) (dst x86.M128i)
- func CvtssI32(a x86.M128) int
- func CvtssI64(a x86.M128) int64
- func CvtssU32(a x86.M128) uint32
- func CvtssU64(a x86.M128) uint64
- func CvttRoundsdI32(a x86.M128d, rounding int) int
- func CvttRoundsdI64(a x86.M128d, rounding int) int64
- func CvttRoundsdSi32(a x86.M128d, rounding int) int
- func CvttRoundsdSi64(a x86.M128d, rounding int) int64
- func CvttRoundsdU32(a x86.M128d, rounding int) uint32
- func CvttRoundsdU64(a x86.M128d, rounding int) uint64
- func CvttRoundssI32(a x86.M128, rounding int) int
- func CvttRoundssI64(a x86.M128, rounding int) int64
- func CvttRoundssSi32(a x86.M128, rounding int) int
- func CvttRoundssSi64(a x86.M128, rounding int) int64
- func CvttRoundssU32(a x86.M128, rounding int) uint32
- func CvttRoundssU64(a x86.M128, rounding int) uint64
- func CvttpdEpu32(a x86.M128d) (dst x86.M128i)
- func CvttpsEpu32(a x86.M128) (dst x86.M128i)
- func CvttsdI32(a x86.M128d) int
- func CvttsdI64(a x86.M128d) int64
- func CvttsdU32(a x86.M128d) uint32
- func CvttsdU64(a x86.M128d) uint64
- func CvttssI32(a x86.M128) int
- func CvttssI64(a x86.M128) int64
- func CvttssU32(a x86.M128) uint32
- func CvttssU64(a x86.M128) uint64
- func Cvtu32Sd(a x86.M128d, b uint32) (dst x86.M128d)
- func Cvtu32Ss(a x86.M128, b uint32) (dst x86.M128)
- func Cvtu64Sd(a x86.M128d, b uint64) (dst x86.M128d)
- func Cvtu64Ss(a x86.M128, b uint64) (dst x86.M128)
- func Cvtusepi32Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtusepi32Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtusepi64Epi16(a x86.M128i) (dst x86.M128i)
- func Cvtusepi64Epi32(a x86.M128i) (dst x86.M128i)
- func Cvtusepi64Epi8(a x86.M128i) (dst x86.M128i)
- func DivRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func DivRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func FixupimmPd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func FixupimmPs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func FixupimmRoundSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
- func FixupimmRoundSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)
- func FixupimmSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func FixupimmSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func GetexpPd(a x86.M128d) (dst x86.M128d)
- func GetexpPs(a x86.M128) (dst x86.M128)
- func GetexpRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func GetexpRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func GetexpSd(a x86.M128d, b x86.M128d) (dst x86.M128d)
- func GetexpSs(a x86.M128, b x86.M128) (dst x86.M128)
- func GetmantPd(a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
- func GetmantPs(a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
- func GetmantRoundSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, ...) (dst x86.M128d)
- func GetmantRoundSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, ...) (dst x86.M128)
- func GetmantSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
- func GetmantSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
- func M256AbsEpi64(a x86.M256i) (dst x86.M256i)
- func M256BroadcastF32x4(a x86.M128) (dst x86.M256)
- func M256BroadcastI32x4(a x86.M128i) (dst x86.M256i)
- func M256CmpEpi32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256CmpEpi64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256CmpEpu32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256CmpEpu64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256CmpPdMask(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)
- func M256CmpPsMask(a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)
- func M256CmpeqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpeqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpeqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpeqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgeEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgeEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgeEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgeEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgtEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgtEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgtEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpgtEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpleEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpleEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpleEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpleEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpltEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpltEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpltEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpltEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpneqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpneqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpneqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256CmpneqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256Cvtepi32Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtepi32Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtepi64Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtepi64Epi32(a x86.M256i) (dst x86.M128i)
- func M256Cvtepi64Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtepu32Pd(a x86.M128i) (dst x86.M256d)
- func M256CvtpdEpu32(a x86.M256d) (dst x86.M128i)
- func M256CvtpsEpu32(a x86.M256) (dst x86.M256i)
- func M256Cvtsepi32Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtsepi32Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtsepi64Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtsepi64Epi32(a x86.M256i) (dst x86.M128i)
- func M256Cvtsepi64Epi8(a x86.M256i) (dst x86.M128i)
- func M256CvttpdEpu32(a x86.M256d) (dst x86.M128i)
- func M256CvttpsEpu32(a x86.M256) (dst x86.M256i)
- func M256Cvtusepi32Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtusepi32Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtusepi64Epi16(a x86.M256i) (dst x86.M128i)
- func M256Cvtusepi64Epi32(a x86.M256i) (dst x86.M128i)
- func M256Cvtusepi64Epi8(a x86.M256i) (dst x86.M128i)
- func M256Extractf32x4Ps(a x86.M256, imm8 byte) (dst x86.M128)
- func M256Extracti32x4Epi32(a x86.M256i, imm8 byte) (dst x86.M128i)
- func M256FixupimmPd(a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)
- func M256FixupimmPs(a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)
- func M256GetexpPd(a x86.M256d) (dst x86.M256d)
- func M256GetexpPs(a x86.M256) (dst x86.M256)
- func M256GetmantPd(a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)
- func M256GetmantPs(a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)
- func M256Insertf32x4(a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)
- func M256Inserti32x4(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)
- func M256Mask2Permutex2varEpi32(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)
- func M256Mask2Permutex2varEpi64(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)
- func M256Mask2Permutex2varPd(a x86.M256d, idx x86.M256i, k x86.Mmask8, b x86.M256d) (dst x86.M256d)
- func M256Mask2Permutex2varPs(a x86.M256, idx x86.M256i, k x86.Mmask8, b x86.M256) (dst x86.M256)
- func M256Mask3FmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256Mask3FmaddsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FmaddsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256Mask3FmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256Mask3FmsubaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FmsubaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256Mask3FnmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FnmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256Mask3FnmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)
- func M256Mask3FnmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)
- func M256MaskAbsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskAbsEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskAddEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAndEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAndEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAndnotEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAndnotEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBlendEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBlendEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBlendPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskBlendPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskBroadcastF32x4(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)
- func M256MaskBroadcastI32x4(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskBroadcastdEpi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskBroadcastqEpi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskBroadcastsdPd(src x86.M256d, k x86.Mmask8, a x86.M128d) (dst x86.M256d)
- func M256MaskBroadcastssPs(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)
- func M256MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpPdMask(k1 x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpPsMask(k1 x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)
- func M256MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskCompressEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskCompressEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskCompressPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskCompressPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)
- func M256MaskCvtepi16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepi16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)
- func M256MaskCvtepi32Ps(src x86.M256, k x86.Mmask8, a x86.M256i) (dst x86.M256)
- func M256MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepi8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)
- func M256MaskCvtepu8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M256d) (dst x86.M128)
- func M256MaskCvtphPs(src x86.M256, k x86.Mmask8, a x86.M128i) (dst x86.M256)
- func M256MaskCvtpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskCvtpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)
- func M256MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskCvttpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskCvttpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskDivPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskDivPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskExpandEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskExpandEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskExpandPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskExpandPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)
- func M256MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)
- func M256MaskFixupimmPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)
- func M256MaskFixupimmPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)
- func M256MaskFmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskFmaddsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFmaddsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskFmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskFmsubaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFmsubaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskFnmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFnmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskFnmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskFnmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskGetexpPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskGetexpPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskGetmantPd(src x86.M256d, k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, ...) (dst x86.M256d)
- func M256MaskGetmantPs(src x86.M256, k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, ...) (dst x86.M256)
- func M256MaskInsertf32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)
- func M256MaskInserti32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)
- func M256MaskMaxEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskMaxPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskMinEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskMinPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskMovEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskMovEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskMovPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskMovPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskMovedupPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskMovehdupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskMoveldupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskMulEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMulEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMulPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskMulPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskMulloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskOrEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskOrEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPermutePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskPermutePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskPermutevarPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)
- func M256MaskPermutevarPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)
- func M256MaskPermutex2varEpi32(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPermutex2varEpi64(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPermutex2varPd(a x86.M256d, k x86.Mmask8, idx x86.M256i, b x86.M256d) (dst x86.M256d)
- func M256MaskPermutex2varPs(a x86.M256, k x86.Mmask8, idx x86.M256i, b x86.M256) (dst x86.M256)
- func M256MaskPermutexEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskPermutexPd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskPermutexvarEpi32(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskPermutexvarEpi64(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskPermutexvarPd(src x86.M256d, k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)
- func M256MaskPermutexvarPs(src x86.M256, k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)
- func M256MaskRcp14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskRcp14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskRolEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskRolEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskRolvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskRolvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskRorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskRorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskRorvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskRorvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskRoundscalePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskRoundscalePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskRsqrt14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskRsqrt14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskScalefPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskScalefPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskSet1Epi32(src x86.M256i, k x86.Mmask8, a int) (dst x86.M256i)
- func M256MaskSet1Epi64(src x86.M256i, k x86.Mmask8, a int64) (dst x86.M256i)
- func M256MaskShuffleEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskShuffleF32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskShuffleF64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskShuffleI32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskShuffleI64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskShufflePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskShufflePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskSllEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSllEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSlliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSlliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSllvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSllvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSqrtPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskSqrtPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskSraEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSraEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSraiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSraiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSravEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSravEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSrlEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSrlEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSrliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSrliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSrlvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSrlvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSubEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskSubPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskTernarylogicEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskTernarylogicEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256MaskUnpackhiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackhiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackhiPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskUnpackhiPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskUnpackloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackloEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackloPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskUnpackloPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskXorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskXorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAbsEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzAbsEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzAddEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAndEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAndEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAndnotEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAndnotEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzBroadcastF32x4(k x86.Mmask8, a x86.M128) (dst x86.M256)
- func M256MaskzBroadcastI32x4(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M256d)
- func M256MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M256)
- func M256MaskzCompressEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzCompressEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzCompressPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzCompressPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)
- func M256MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)
- func M256MaskzCvtepi32Ps(k x86.Mmask8, a x86.M256i) (dst x86.M256)
- func M256MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)
- func M256MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskzCvtpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskzCvtpdPs(k x86.Mmask8, a x86.M256d) (dst x86.M128)
- func M256MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M256)
- func M256MaskzCvtpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskzCvtpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskzCvtpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)
- func M256MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvttpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskzCvttpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)
- func M256MaskzCvttpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskzCvttpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)
- func M256MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)
- func M256MaskzDivPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzDivPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzExpandEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzExpandEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzExpandPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzExpandPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)
- func M256MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)
- func M256MaskzFixupimmPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)
- func M256MaskzFixupimmPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)
- func M256MaskzFmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzFmaddsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFmaddsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzFmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzFmsubaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFmsubaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzFnmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFnmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzFnmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)
- func M256MaskzFnmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)
- func M256MaskzGetexpPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzGetexpPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzGetmantPd(k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)
- func M256MaskzGetmantPs(k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)
- func M256MaskzInsertf32x4(k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)
- func M256MaskzInserti32x4(k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)
- func M256MaskzMaxEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzMaxPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzMinEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzMinPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzMovEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzMovEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)
- func M256MaskzMovPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzMovPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzMovedupPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzMovehdupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzMoveldupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzMulEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMulEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMulPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzMulPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzMulloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzOrEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzOrEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPermutePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskzPermutePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskzPermutevarPd(k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)
- func M256MaskzPermutevarPs(k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)
- func M256MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPermutex2varPd(k x86.Mmask8, a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)
- func M256MaskzPermutex2varPs(k x86.Mmask8, a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)
- func M256MaskzPermutexEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzPermutexPd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskzPermutexvarEpi32(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskzPermutexvarPd(k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)
- func M256MaskzPermutexvarPs(k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)
- func M256MaskzRcp14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzRcp14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzRolEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzRolEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzRolvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzRolvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzRorEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzRorEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzRorvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzRorvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzRoundscalePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskzRoundscalePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskzRsqrt14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzRsqrt14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzScalefPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzScalefPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M256i)
- func M256MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M256i)
- func M256MaskzShuffleEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzShuffleF32x4(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskzShuffleF64x2(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskzShuffleI32x4(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzShuffleI64x2(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzShufflePd(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
- func M256MaskzShufflePs(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
- func M256MaskzSllEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSllEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSlliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSlliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSllvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSllvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSqrtPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)
- func M256MaskzSqrtPs(k x86.Mmask8, a x86.M256) (dst x86.M256)
- func M256MaskzSraEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSraEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSraiEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSraiEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSravEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSravEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSrlEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSrlEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSrliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSrliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSrlvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSrlvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSubEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzSubPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackhiPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzUnpackhiPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzUnpackloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackloEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackloPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256MaskzUnpackloPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)
- func M256MaskzXorEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzXorEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaxEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaxEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MinEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MinEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256Permutex2varEpi32(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256Permutex2varEpi64(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256Permutex2varPd(a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)
- func M256Permutex2varPs(a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)
- func M256PermutexEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256PermutexPd(a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256PermutexvarEpi32(idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256PermutexvarEpi64(idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256PermutexvarPd(idx x86.M256i, a x86.M256d) (dst x86.M256d)
- func M256PermutexvarPs(idx x86.M256i, a x86.M256) (dst x86.M256)
- func M256Rcp14Pd(a x86.M256d) (dst x86.M256d)
- func M256Rcp14Ps(a x86.M256) (dst x86.M256)
- func M256RolEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256RolEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256RolvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256RolvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256RorEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256RorEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256RorvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256RorvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256RoundscalePd(a x86.M256d, imm8 byte) (dst x86.M256d)
- func M256RoundscalePs(a x86.M256, imm8 byte) (dst x86.M256)
- func M256ScalefPd(a x86.M256d, b x86.M256d) (dst x86.M256d)
- func M256ScalefPs(a x86.M256, b x86.M256) (dst x86.M256)
- func M256ShuffleF32x4(a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
- func M256ShuffleF64x2(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
- func M256ShuffleI32x4(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256ShuffleI64x2(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256SraEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256SraiEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256SravEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256TernarylogicEpi32(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
- func M256TernarylogicEpi64(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
- func M256TestEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256TestEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256TestnEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M256TestnEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)
- func M512AbsEpi32(a x86.M512i) (dst x86.M512i)
- func M512AbsEpi64(a x86.M512i) (dst x86.M512i)
- func M512AcosPd(a x86.M512d) (dst x86.M512d)
- func M512AcosPs(a x86.M512) (dst x86.M512)
- func M512AcoshPd(a x86.M512d) (dst x86.M512d)
- func M512AcoshPs(a x86.M512) (dst x86.M512)
- func M512AddEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AlignrEpi64(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512AsinPd(a x86.M512d) (dst x86.M512d)
- func M512AsinPs(a x86.M512) (dst x86.M512)
- func M512AsinhPd(a x86.M512d) (dst x86.M512d)
- func M512AsinhPs(a x86.M512) (dst x86.M512)
- func M512Atan2Pd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512Atan2Ps(a x86.M512, b x86.M512) (dst x86.M512)
- func M512AtanPd(a x86.M512d) (dst x86.M512d)
- func M512AtanPs(a x86.M512) (dst x86.M512)
- func M512AtanhPd(a x86.M512d) (dst x86.M512d)
- func M512AtanhPs(a x86.M512) (dst x86.M512)
- func M512BroadcastF32x4(a x86.M128) (dst x86.M512)
- func M512BroadcastF64x4(a x86.M256d) (dst x86.M512d)
- func M512BroadcastI32x4(a x86.M128i) (dst x86.M512i)
- func M512BroadcastI64x4(a x86.M256i) (dst x86.M512i)
- func M512BroadcastdEpi32(a x86.M128i) (dst x86.M512i)
- func M512BroadcastqEpi64(a x86.M128i) (dst x86.M512i)
- func M512BroadcastsdPd(a x86.M128d) (dst x86.M512d)
- func M512BroadcastssPs(a x86.M128) (dst x86.M512)
- func M512Castpd128Pd512(a x86.M128d) (dst x86.M512d)
- func M512Castpd256Pd512(a x86.M256d) (dst x86.M512d)
- func M512Castpd512Pd128(a x86.M512d) (dst x86.M128d)
- func M512Castpd512Pd256(a x86.M512d) (dst x86.M256d)
- func M512Castps128Ps512(a x86.M128) (dst x86.M512)
- func M512Castps256Ps512(a x86.M256) (dst x86.M512)
- func M512Castps512Ps128(a x86.M512) (dst x86.M128)
- func M512Castps512Ps256(a x86.M512) (dst x86.M256)
- func M512Castsi128Si512(a x86.M128i) (dst x86.M512i)
- func M512Castsi256Si512(a x86.M256i) (dst x86.M512i)
- func M512Castsi512Si128(a x86.M512i) (dst x86.M128i)
- func M512Castsi512Si256(a x86.M512i) (dst x86.M256i)
- func M512CbrtPd(a x86.M512d) (dst x86.M512d)
- func M512CbrtPs(a x86.M512) (dst x86.M512)
- func M512CdfnormPd(a x86.M512d) (dst x86.M512d)
- func M512CdfnormPs(a x86.M512) (dst x86.M512)
- func M512CdfnorminvPd(a x86.M512d) (dst x86.M512d)
- func M512CdfnorminvPs(a x86.M512) (dst x86.M512)
- func M512CeilPd(a x86.M512d) (dst x86.M512d)
- func M512CeilPs(a x86.M512) (dst x86.M512)
- func M512CmpEpi64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)
- func M512CmpEpu64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)
- func M512CmpeqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpeqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpgeEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpgeEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpgtEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpgtEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpleEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpleEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpltEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
- func M512CmpltEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpltEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpneqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CmpneqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512CosPd(a x86.M512d) (dst x86.M512d)
- func M512CosPs(a x86.M512) (dst x86.M512)
- func M512CosdPd(a x86.M512d) (dst x86.M512d)
- func M512CosdPs(a x86.M512) (dst x86.M512)
- func M512CoshPd(a x86.M512d) (dst x86.M512d)
- func M512CoshPs(a x86.M512) (dst x86.M512)
- func M512CvtRoundepi32Ps(a x86.M512i, rounding int) (dst x86.M512)
- func M512CvtRoundepu32Ps(a x86.M512i, rounding int) (dst x86.M512)
- func M512CvtRoundpdEpi32(a x86.M512d, rounding int) (dst x86.M256i)
- func M512CvtRoundpdEpu32(a x86.M512d, rounding int) (dst x86.M256i)
- func M512CvtRoundpdPs(a x86.M512d, rounding int) (dst x86.M256)
- func M512CvtRoundphPs(a x86.M256i, sae int) (dst x86.M512)
- func M512CvtRoundpsEpi32(a x86.M512, rounding int) (dst x86.M512i)
- func M512CvtRoundpsEpu32(a x86.M512, rounding int) (dst x86.M512i)
- func M512CvtRoundpsPd(a x86.M256, sae int) (dst x86.M512d)
- func M512CvtRoundpsPh(a x86.M512, rounding int) (dst x86.M256i)
- func M512Cvtepi16Epi32(a x86.M256i) (dst x86.M512i)
- func M512Cvtepi16Epi64(a x86.M128i) (dst x86.M512i)
- func M512Cvtepi32Epi16(a x86.M512i) (dst x86.M256i)
- func M512Cvtepi32Epi64(a x86.M256i) (dst x86.M512i)
- func M512Cvtepi32Epi8(a x86.M512i) (dst x86.M128i)
- func M512Cvtepi32Pd(a x86.M256i) (dst x86.M512d)
- func M512Cvtepi32Ps(a x86.M512i) (dst x86.M512)
- func M512Cvtepi64Epi16(a x86.M512i) (dst x86.M128i)
- func M512Cvtepi64Epi32(a x86.M512i) (dst x86.M256i)
- func M512Cvtepi64Epi8(a x86.M512i) (dst x86.M128i)
- func M512Cvtepi8Epi32(a x86.M128i) (dst x86.M512i)
- func M512Cvtepi8Epi64(a x86.M128i) (dst x86.M512i)
- func M512Cvtepu16Epi32(a x86.M256i) (dst x86.M512i)
- func M512Cvtepu16Epi64(a x86.M128i) (dst x86.M512i)
- func M512Cvtepu32Epi64(a x86.M256i) (dst x86.M512i)
- func M512Cvtepu32Pd(a x86.M256i) (dst x86.M512d)
- func M512Cvtepu32Ps(a x86.M512i) (dst x86.M512)
- func M512Cvtepu8Epi32(a x86.M128i) (dst x86.M512i)
- func M512Cvtepu8Epi64(a x86.M128i) (dst x86.M512i)
- func M512CvtpdEpi32(a x86.M512d) (dst x86.M256i)
- func M512CvtpdEpu32(a x86.M512d) (dst x86.M256i)
- func M512CvtpdPs(a x86.M512d) (dst x86.M256)
- func M512CvtphPs(a x86.M256i) (dst x86.M512)
- func M512CvtpsEpi32(a x86.M512) (dst x86.M512i)
- func M512CvtpsEpu32(a x86.M512) (dst x86.M512i)
- func M512CvtpsPd(a x86.M256) (dst x86.M512d)
- func M512CvtpsPh(a x86.M512, rounding int) (dst x86.M256i)
- func M512Cvtsepi32Epi16(a x86.M512i) (dst x86.M256i)
- func M512Cvtsepi32Epi8(a x86.M512i) (dst x86.M128i)
- func M512Cvtsepi64Epi16(a x86.M512i) (dst x86.M128i)
- func M512Cvtsepi64Epi32(a x86.M512i) (dst x86.M256i)
- func M512Cvtsepi64Epi8(a x86.M512i) (dst x86.M128i)
- func M512CvttRoundpdEpi32(a x86.M512d, sae int) (dst x86.M256i)
- func M512CvttRoundpdEpu32(a x86.M512d, sae int) (dst x86.M256i)
- func M512CvttRoundpsEpi32(a x86.M512, sae int) (dst x86.M512i)
- func M512CvttRoundpsEpu32(a x86.M512, sae int) (dst x86.M512i)
- func M512CvttpdEpi32(a x86.M512d) (dst x86.M256i)
- func M512CvttpdEpu32(a x86.M512d) (dst x86.M256i)
- func M512CvttpsEpi32(a x86.M512) (dst x86.M512i)
- func M512CvttpsEpu32(a x86.M512) (dst x86.M512i)
- func M512Cvtusepi32Epi16(a x86.M512i) (dst x86.M256i)
- func M512Cvtusepi32Epi8(a x86.M512i) (dst x86.M128i)
- func M512Cvtusepi64Epi16(a x86.M512i) (dst x86.M128i)
- func M512Cvtusepi64Epi32(a x86.M512i) (dst x86.M256i)
- func M512Cvtusepi64Epi8(a x86.M512i) (dst x86.M128i)
- func M512DivEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512DivPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512DivPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512DivRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512DivRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512ErfPd(a x86.M512d) (dst x86.M512d)
- func M512ErfPs(a x86.M512) (dst x86.M512)
- func M512ErfcPd(a x86.M512d) (dst x86.M512d)
- func M512ErfcPs(a x86.M512) (dst x86.M512)
- func M512ErfcinvPd(a x86.M512d) (dst x86.M512d)
- func M512ErfcinvPs(a x86.M512) (dst x86.M512)
- func M512ErfinvPd(a x86.M512d) (dst x86.M512d)
- func M512ErfinvPs(a x86.M512) (dst x86.M512)
- func M512Exp10Pd(a x86.M512d) (dst x86.M512d)
- func M512Exp10Ps(a x86.M512) (dst x86.M512)
- func M512Exp2Pd(a x86.M512d) (dst x86.M512d)
- func M512Exp2Ps(a x86.M512) (dst x86.M512)
- func M512ExpPd(a x86.M512d) (dst x86.M512d)
- func M512ExpPs(a x86.M512) (dst x86.M512)
- func M512Expm1Pd(a x86.M512d) (dst x86.M512d)
- func M512Expm1Ps(a x86.M512) (dst x86.M512)
- func M512Extractf32x4Ps(a x86.M512, imm8 byte) (dst x86.M128)
- func M512Extractf64x4Pd(a x86.M512d, imm8 byte) (dst x86.M256d)
- func M512Extracti32x4Epi32(a x86.M512i, imm8 byte) (dst x86.M128i)
- func M512Extracti64x4Epi64(a x86.M512i, imm8 byte) (dst x86.M256i)
- func M512FixupimmPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)
- func M512FixupimmPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)
- func M512FixupimmRoundPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
- func M512FixupimmRoundPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
- func M512FloorPd(a x86.M512d) (dst x86.M512d)
- func M512FloorPs(a x86.M512) (dst x86.M512)
- func M512FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512FmaddsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512FmsubaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512HypotPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512HypotPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512Insertf32x4(a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)
- func M512Insertf64x4(a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)
- func M512Inserti32x4(a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)
- func M512Inserti64x4(a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)
- func M512InvsqrtPd(a x86.M512d) (dst x86.M512d)
- func M512InvsqrtPs(a x86.M512) (dst x86.M512)
- func M512Kand(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Kandn(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Kmov(a x86.Mmask16) (dst x86.Mmask16)
- func M512Knot(a x86.Mmask16) (dst x86.Mmask16)
- func M512Kor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Kortestc(k1 x86.Mmask16, k2 x86.Mmask16) int
- func M512Kortestz(k1 x86.Mmask16, k2 x86.Mmask16) int
- func M512Kunpackb(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Kxnor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Kxor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
- func M512Log10Pd(a x86.M512d) (dst x86.M512d)
- func M512Log10Ps(a x86.M512) (dst x86.M512)
- func M512Log1pPd(a x86.M512d) (dst x86.M512d)
- func M512Log1pPs(a x86.M512) (dst x86.M512)
- func M512Log2Pd(a x86.M512d) (dst x86.M512d)
- func M512LogPd(a x86.M512d) (dst x86.M512d)
- func M512LogPs(a x86.M512) (dst x86.M512)
- func M512LogbPd(a x86.M512d) (dst x86.M512d)
- func M512LogbPs(a x86.M512) (dst x86.M512)
- func M512Mask2Permutex2varEpi32(a x86.M512i, idx x86.M512i, k x86.Mmask16, b x86.M512i) (dst x86.M512i)
- func M512Mask2Permutex2varEpi64(a x86.M512i, idx x86.M512i, k x86.Mmask8, b x86.M512i) (dst x86.M512i)
- func M512Mask2Permutex2varPd(a x86.M512d, idx x86.M512i, k x86.Mmask8, b x86.M512d) (dst x86.M512d)
- func M512Mask2Permutex2varPs(a x86.M512, idx x86.M512i, k x86.Mmask16, b x86.M512) (dst x86.M512)
- func M512Mask3FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
- func M512Mask3FmaddsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
- func M512Mask3FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
- func M512Mask3FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
- func M512Mask3FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
- func M512Mask3FmsubaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
- func M512Mask3FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
- func M512Mask3FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
- func M512MaskAbsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskAbsEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskAcosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAcosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskAcoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAcoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskAddEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAlignrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512MaskAsinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAsinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskAsinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAsinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskAtan2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskAtan2Ps(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskAtanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAtanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskAtanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskAtanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskBroadcastF32x4(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)
- func M512MaskBroadcastF64x4(src x86.M512d, k x86.Mmask8, a x86.M256d) (dst x86.M512d)
- func M512MaskBroadcastI32x4(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskBroadcastI64x4(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskBroadcastdEpi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskBroadcastqEpi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskBroadcastsdPd(src x86.M512d, k x86.Mmask8, a x86.M128d) (dst x86.M512d)
- func M512MaskBroadcastssPs(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)
- func M512MaskCbrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCbrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCdfnormPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCdfnormPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCdfnorminvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCdfnorminvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCeilPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCeilPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)
- func M512MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)
- func M512MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpltEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
- func M512MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskCompressEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskCompressEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskCompressPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCompressPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCosdPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCosdPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskCoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskCvtRoundepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)
- func M512MaskCvtRoundepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)
- func M512MaskCvtRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
- func M512MaskCvtRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
- func M512MaskCvtRoundpdPs(src x86.M256, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)
- func M512MaskCvtRoundphPs(src x86.M512, k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)
- func M512MaskCvtRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
- func M512MaskCvtRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
- func M512MaskCvtRoundpsPd(src x86.M512d, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)
- func M512MaskCvtRoundpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)
- func M512MaskCvtepi16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtepi16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtepi32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtepi32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)
- func M512MaskCvtepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)
- func M512MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtepi8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtepi8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtepu16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtepu16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtepu32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtepu32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)
- func M512MaskCvtepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)
- func M512MaskCvtepu8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtepu8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskCvtpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskCvtpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskCvtpdPs(src x86.M256, k x86.Mmask8, a x86.M512d) (dst x86.M256)
- func M512MaskCvtphPs(src x86.M512, k x86.Mmask16, a x86.M256i) (dst x86.M512)
- func M512MaskCvtpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskCvtpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskCvtpsPd(src x86.M512d, k x86.Mmask8, a x86.M256) (dst x86.M512d)
- func M512MaskCvtpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)
- func M512MaskCvtsepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtsepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskCvttRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)
- func M512MaskCvttRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)
- func M512MaskCvttRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)
- func M512MaskCvttRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)
- func M512MaskCvttpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskCvttpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskCvttpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskCvttpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskCvtusepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskCvtusepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskDivEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskDivEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskDivPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskDivPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskDivRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskDivRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskErfPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskErfPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskErfcPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskErfcPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskErfcinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskErfcinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskErfinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskErfinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExp10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskExp10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExp2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskExp2Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExpPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskExpPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExpandEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskExpandEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskExpandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskExpandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExpm1Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskExpm1Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)
- func M512MaskExtractf64x4Pd(src x86.M256d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)
- func M512MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)
- func M512MaskExtracti64x4Epi64(src x86.M256i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)
- func M512MaskFixupimmPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)
- func M512MaskFixupimmPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)
- func M512MaskFixupimmRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
- func M512MaskFixupimmRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
- func M512MaskFloorPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskFloorPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskFmaddsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskFmaddsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskFmaddsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskFmaddsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskFmsubaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskFmsubaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskFmsubaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskFmsubaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskHypotPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskHypotPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskInsertf32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)
- func M512MaskInsertf64x4(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)
- func M512MaskInserti32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)
- func M512MaskInserti64x4(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)
- func M512MaskInvsqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskInvsqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskLog10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskLog10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskLog1pPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskLog1pPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskLog2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskLogPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskLogPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskLogbPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskLogbPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskMaxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskMaxPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskMaxRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MaskMaxRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MaskMinEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskMinPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskMinRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MaskMinRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MaskMovedupPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskMovehdupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskMoveldupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskMulEpi32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMulEpu32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMulloxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskNearbyintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskNearbyintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskPermutePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskPermutePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskPermutevarPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)
- func M512MaskPermutevarPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)
- func M512MaskPermutex2varEpi32(a x86.M512i, k x86.Mmask16, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPermutex2varEpi64(a x86.M512i, k x86.Mmask8, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPermutex2varPd(a x86.M512d, k x86.Mmask8, idx x86.M512i, b x86.M512d) (dst x86.M512d)
- func M512MaskPermutex2varPs(a x86.M512, k x86.Mmask16, idx x86.M512i, b x86.M512) (dst x86.M512)
- func M512MaskPermutexEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskPermutexPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskPermutexvarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskPermutexvarEpi64(src x86.M512i, k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskPermutexvarPd(src x86.M512d, k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)
- func M512MaskPermutexvarPs(src x86.M512, k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)
- func M512MaskPowPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskPowPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskRcp14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskRcp14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskRecipPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskRecipPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskRemEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRemEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskRintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskRolEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskRolEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskRolvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRolvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRorEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskRorEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskRorvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRorvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskRoundscalePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskRoundscalePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskRoundscaleRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)
- func M512MaskRoundscaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)
- func M512MaskRsqrt14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskRsqrt14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskScalefPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskScalefPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskScalefRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskScalefRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskSet1Epi32(src x86.M512i, k x86.Mmask16, a int) (dst x86.M512i)
- func M512MaskSet1Epi64(src x86.M512i, k x86.Mmask8, a int64) (dst x86.M512i)
- func M512MaskShuffleF32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskShuffleF64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskShuffleI32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskShuffleI64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskShufflePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskShufflePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskSinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskSinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskSincosPd(cos_res *x86.M512d, sin_src x86.M512d, cos_src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskSincosPs(cos_res *x86.M512, sin_src x86.M512, cos_src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskSindPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskSindPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskSinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskSinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskSllEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSllEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSlliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSllvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskSqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskSqrtRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskSqrtRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)
- func M512MaskSraEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSraEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSraiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSravEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSrlEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSrlEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSrliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSrlvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSubEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSvmlRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskTanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskTanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskTandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskTandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskTanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskTanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskTernarylogicEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskTernarylogicEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskTestnEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
- func M512MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512MaskTruncPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskTruncPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskUnpackhiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackhiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackhiPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskUnpackhiPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskUnpackloEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackloEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackloPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskUnpackloPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzAbsEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskzAbsEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskzAddEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzAddPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzAddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzAddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskzAlignrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512MaskzAlignrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512MaskzAndEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAndEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAndnotEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAndnotEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzBroadcastF32x4(k x86.Mmask16, a x86.M128) (dst x86.M512)
- func M512MaskzBroadcastF64x4(k x86.Mmask8, a x86.M256d) (dst x86.M512d)
- func M512MaskzBroadcastI32x4(k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskzBroadcastI64x4(k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskzBroadcastdEpi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M512d)
- func M512MaskzBroadcastssPs(k x86.Mmask16, a x86.M128) (dst x86.M512)
- func M512MaskzCompressEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskzCompressEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskzCompressPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzCompressPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzCvtRoundepi32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)
- func M512MaskzCvtRoundepu32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)
- func M512MaskzCvtRoundpdEpi32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
- func M512MaskzCvtRoundpdEpu32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
- func M512MaskzCvtRoundpdPs(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)
- func M512MaskzCvtRoundphPs(k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)
- func M512MaskzCvtRoundpsEpi32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
- func M512MaskzCvtRoundpsEpu32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
- func M512MaskzCvtRoundpsPd(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)
- func M512MaskzCvtRoundpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)
- func M512MaskzCvtepi16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtepi32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)
- func M512MaskzCvtepi32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)
- func M512MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtepi8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepu16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtepu32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)
- func M512MaskzCvtepu32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)
- func M512MaskzCvtepu8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskzCvtpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskzCvtpdPs(k x86.Mmask8, a x86.M512d) (dst x86.M256)
- func M512MaskzCvtphPs(k x86.Mmask16, a x86.M256i) (dst x86.M512)
- func M512MaskzCvtpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskzCvtpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskzCvtpsPd(k x86.Mmask8, a x86.M256) (dst x86.M512d)
- func M512MaskzCvtpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)
- func M512MaskzCvtsepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtsepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvttRoundpdEpi32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)
- func M512MaskzCvttRoundpdEpu32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)
- func M512MaskzCvttRoundpsEpi32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)
- func M512MaskzCvttRoundpsEpu32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)
- func M512MaskzCvttpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskzCvttpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)
- func M512MaskzCvttpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskzCvttpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)
- func M512MaskzCvtusepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtusepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)
- func M512MaskzDivPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzDivPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzDivRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzDivRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskzExpandEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskzExpandEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskzExpandPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzExpandPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)
- func M512MaskzExtractf64x4Pd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)
- func M512MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)
- func M512MaskzExtracti64x4Epi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)
- func M512MaskzFixupimmPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)
- func M512MaskzFixupimmPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)
- func M512MaskzFixupimmRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
- func M512MaskzFixupimmRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
- func M512MaskzFmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzFmaddsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFmaddsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFmaddsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFmaddsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzFmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzFmsubaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFmsubaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFmsubaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFmsubaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzFnmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFnmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFnmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFnmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzFnmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
- func M512MaskzFnmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
- func M512MaskzFnmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzFnmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
- func M512MaskzGetexpPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzGetexpPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzGetexpRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzGetexpRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)
- func M512MaskzGetmantPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)
- func M512MaskzGetmantPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)
- func M512MaskzGetmantRoundPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, ...) (dst x86.M512d)
- func M512MaskzGetmantRoundPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, ...) (dst x86.M512)
- func M512MaskzInsertf32x4(k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)
- func M512MaskzInsertf64x4(k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)
- func M512MaskzInserti32x4(k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)
- func M512MaskzInserti64x4(k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)
- func M512MaskzMaxEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzMaxPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzMaxRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MaskzMaxRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MaskzMinEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzMinPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzMinRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MaskzMinRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MaskzMovEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)
- func M512MaskzMovEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)
- func M512MaskzMovPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzMovPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzMovedupPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzMovehdupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzMoveldupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzMulEpi32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMulEpu32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMulPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzMulPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzMulRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzMulRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskzMulloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzOrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzOrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPermutePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskzPermutePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskzPermutevarPd(k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)
- func M512MaskzPermutevarPs(k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)
- func M512MaskzPermutex2varEpi32(k x86.Mmask16, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPermutex2varPd(k x86.Mmask8, a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)
- func M512MaskzPermutex2varPs(k x86.Mmask16, a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)
- func M512MaskzPermutexEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzPermutexPd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskzPermutexvarEpi32(k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskzPermutexvarPd(k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)
- func M512MaskzPermutexvarPs(k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)
- func M512MaskzRcp14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzRcp14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzRolEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzRolEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzRolvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzRolvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzRorEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzRorEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzRorvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzRorvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzRoundscalePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskzRoundscalePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskzRoundscaleRoundPd(k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)
- func M512MaskzRoundscaleRoundPs(k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)
- func M512MaskzRsqrt14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzRsqrt14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzScalefPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzScalefPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzScalefRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzScalefRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskzSet1Epi32(k x86.Mmask16, a int) (dst x86.M512i)
- func M512MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M512i)
- func M512MaskzShuffleEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzShuffleF32x4(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskzShuffleF64x2(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskzShuffleI32x4(k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzShuffleI64x2(k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzShufflePd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512MaskzShufflePs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512MaskzSllEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSllEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSlliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSlliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSllvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSllvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSqrtPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
- func M512MaskzSqrtPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
- func M512MaskzSqrtRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzSqrtRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)
- func M512MaskzSraEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSraEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSraiEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSraiEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSravEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSravEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSrlEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSrlEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSrliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSrliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSrlvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSrlvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSubEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzSubPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzSubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512MaskzSubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512MaskzTernarylogicEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzUnpackhiEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackhiPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzUnpackhiPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzUnpackloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackloEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackloPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaskzUnpackloPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaskzXorEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzXorEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MaxPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512MaxRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MaxRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MinEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512MinPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512MinRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
- func M512MinRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)
- func M512MovedupPd(a x86.M512d) (dst x86.M512d)
- func M512MovehdupPs(a x86.M512) (dst x86.M512)
- func M512MoveldupPs(a x86.M512) (dst x86.M512)
- func M512MulEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MulEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MulloxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512NearbyintPd(a x86.M512d) (dst x86.M512d)
- func M512NearbyintPs(a x86.M512) (dst x86.M512)
- func M512PermutePd(a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512PermutePs(a x86.M512, imm8 byte) (dst x86.M512)
- func M512PermutevarPd(a x86.M512d, b x86.M512i) (dst x86.M512d)
- func M512PermutevarPs(a x86.M512, b x86.M512i) (dst x86.M512)
- func M512Permutex2varEpi32(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512Permutex2varEpi64(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512Permutex2varPd(a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)
- func M512Permutex2varPs(a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)
- func M512PermutexEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512PermutexPd(a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512PermutexvarEpi32(idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512PermutexvarEpi64(idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512PermutexvarPd(idx x86.M512i, a x86.M512d) (dst x86.M512d)
- func M512PermutexvarPs(idx x86.M512i, a x86.M512) (dst x86.M512)
- func M512PowPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512PowPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512Rcp14Pd(a x86.M512d) (dst x86.M512d)
- func M512Rcp14Ps(a x86.M512) (dst x86.M512)
- func M512RecipPd(a x86.M512d) (dst x86.M512d)
- func M512RecipPs(a x86.M512) (dst x86.M512)
- func M512RemEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RemEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RintPd(a x86.M512d) (dst x86.M512d)
- func M512RintPs(a x86.M512) (dst x86.M512)
- func M512RolEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512RolEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512RolvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RolvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RorEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512RorEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512RorvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RorvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512RoundscalePd(a x86.M512d, imm8 byte) (dst x86.M512d)
- func M512RoundscalePs(a x86.M512, imm8 byte) (dst x86.M512)
- func M512RoundscaleRoundPd(a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)
- func M512RoundscaleRoundPs(a x86.M512, imm8 byte, rounding int) (dst x86.M512)
- func M512Rsqrt14Pd(a x86.M512d) (dst x86.M512d)
- func M512Rsqrt14Ps(a x86.M512) (dst x86.M512)
- func M512ScalefPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512ScalefPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512ScalefRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
- func M512ScalefRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
- func M512Set1Epi16(a int16) (dst x86.M512i)
- func M512Set1Epi32(a int) (dst x86.M512i)
- func M512Set1Epi64(a int64) (dst x86.M512i)
- func M512Set1Epi8(a byte) (dst x86.M512i)
- func M512Set1Pd(a float64) (dst x86.M512d)
- func M512Set1Ps(a float32) (dst x86.M512)
- func M512Set4Epi32(d int, c int, b int, a int) (dst x86.M512i)
- func M512Set4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)
- func M512Set4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)
- func M512Set4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)
- func M512SetEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, ...) (dst x86.M512i)
- func M512SetEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)
- func M512SetPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, ...) (dst x86.M512d)
- func M512SetPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, ...) (dst x86.M512)
- func M512Setr4Epi32(d int, c int, b int, a int) (dst x86.M512i)
- func M512Setr4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)
- func M512Setr4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)
- func M512Setr4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)
- func M512SetrEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, ...) (dst x86.M512i)
- func M512SetrEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)
- func M512SetrPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, ...) (dst x86.M512d)
- func M512SetrPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, ...) (dst x86.M512)
- func M512Setzero() (dst x86.M512)
- func M512SetzeroEpi32() (dst x86.M512i)
- func M512SetzeroPd() (dst x86.M512d)
- func M512SetzeroPs() (dst x86.M512)
- func M512SetzeroSi512() (dst x86.M512i)
- func M512ShuffleF32x4(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512ShuffleF64x2(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512ShuffleI32x4(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512ShuffleI64x2(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512ShufflePd(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
- func M512ShufflePs(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
- func M512SinPd(a x86.M512d) (dst x86.M512d)
- func M512SinPs(a x86.M512) (dst x86.M512)
- func M512SincosPd(cos_res *x86.M512d, a x86.M512d) (dst x86.M512d)
- func M512SincosPs(cos_res *x86.M512, a x86.M512) (dst x86.M512)
- func M512SindPd(a x86.M512d) (dst x86.M512d)
- func M512SindPs(a x86.M512) (dst x86.M512)
- func M512SinhPd(a x86.M512d) (dst x86.M512d)
- func M512SinhPs(a x86.M512) (dst x86.M512)
- func M512SllEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SllEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SlliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SllvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SqrtPd(a x86.M512d) (dst x86.M512d)
- func M512SqrtPs(a x86.M512) (dst x86.M512)
- func M512SqrtRoundPd(a x86.M512d, rounding int) (dst x86.M512d)
- func M512SqrtRoundPs(a x86.M512, rounding int) (dst x86.M512)
- func M512SraEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SraEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SraiEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SravEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SrlEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SrlEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SrliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SrlvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SubEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SvmlRoundPd(a x86.M512d) (dst x86.M512d)
- func M512TanPd(a x86.M512d) (dst x86.M512d)
- func M512TanPs(a x86.M512) (dst x86.M512)
- func M512TandPd(a x86.M512d) (dst x86.M512d)
- func M512TandPs(a x86.M512) (dst x86.M512)
- func M512TanhPd(a x86.M512d) (dst x86.M512d)
- func M512TanhPs(a x86.M512) (dst x86.M512)
- func M512TernarylogicEpi32(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
- func M512TernarylogicEpi64(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
- func M512TestEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512TestnEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
- func M512TestnEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)
- func M512TruncPd(a x86.M512d) (dst x86.M512d)
- func M512TruncPs(a x86.M512) (dst x86.M512)
- func M512Undefined() (dst x86.M512)
- func M512UndefinedEpi32() (dst x86.M512i)
- func M512UndefinedPd() (dst x86.M512d)
- func M512UndefinedPs() (dst x86.M512)
- func M512UnpackhiEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackhiEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackhiPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512UnpackhiPs(a x86.M512, b x86.M512) (dst x86.M512)
- func M512UnpackloEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackloEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackloPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
- func M512UnpackloPs(a x86.M512, b x86.M512) (dst x86.M512)
- func Mask2Permutex2varEpi32(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)
- func Mask2Permutex2varEpi64(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)
- func Mask2Permutex2varPd(a x86.M128d, idx x86.M128i, k x86.Mmask8, b x86.M128d) (dst x86.M128d)
- func Mask2Permutex2varPs(a x86.M128, idx x86.M128i, k x86.Mmask8, b x86.M128) (dst x86.M128)
- func Mask3FmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
- func Mask3FmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
- func Mask3FmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FmaddsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmaddsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
- func Mask3FmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
- func Mask3FmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FmsubaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FmsubaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FnmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FnmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FnmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
- func Mask3FnmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
- func Mask3FnmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FnmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FnmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FnmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func Mask3FnmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
- func Mask3FnmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
- func Mask3FnmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)
- func Mask3FnmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)
- func MaskAbsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskAbsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskAddEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskAddRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskAddSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskAddSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskAndEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAndEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAndnotEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAndnotEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBlendEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBlendEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBlendPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskBlendPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskBroadcastdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskBroadcastqEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskBroadcastssPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpPdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)
- func MaskCmpPsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)
- func MaskCmpRoundSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)
- func MaskCmpRoundSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)
- func MaskCmpSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)
- func MaskCmpSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)
- func MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCompressEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCompressEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCompressPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskCompressPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)
- func MaskCvtRoundsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)
- func MaskCvtRoundssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)
- func MaskCvtepi16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)
- func MaskCvtepi32Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)
- func MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)
- func MaskCvtepu8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M128d) (dst x86.M128)
- func MaskCvtphPs(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)
- func MaskCvtpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskCvtpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)
- func MaskCvtsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)
- func MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)
- func MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskCvttpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskCvttpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskDivPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskDivPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskDivRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskDivRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskDivSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskDivSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskExpandEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskExpandEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskExpandPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskExpandPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskFixupimmPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func MaskFixupimmPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func MaskFixupimmRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
- func MaskFixupimmRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)
- func MaskFixupimmSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func MaskFixupimmSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func MaskFmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskFmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskFmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFmaddsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmaddsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskFmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskFmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFmsubaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFmsubaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFnmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFnmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFnmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskFnmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskFnmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFnmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFnmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFnmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskFnmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskFnmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskFnmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskFnmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskGetexpPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskGetexpPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskGetexpRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskGetexpRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskGetexpSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskGetexpSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskGetmantPd(src x86.M128d, k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, ...) (dst x86.M128d)
- func MaskGetmantPs(src x86.M128, k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, ...) (dst x86.M128)
- func MaskGetmantRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, ...) (dst x86.M128d)
- func MaskGetmantRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, ...) (dst x86.M128)
- func MaskGetmantSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, ...) (dst x86.M128d)
- func MaskGetmantSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, ...) (dst x86.M128)
- func MaskLoadSd(src x86.M128d, k x86.Mmask8, mem_addr *float64) (dst x86.M128d)
- func MaskLoadSs(src x86.M128, k x86.Mmask8, mem_addr *float32) (dst x86.M128)
- func MaskMaxEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMaxPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMaxRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MaskMaxRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MaskMaxSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMaxSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMinEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMinPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMinRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MaskMinRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MaskMinSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMinSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMovEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskMovEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskMovPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskMovPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskMoveSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMoveSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMovedupPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskMovehdupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskMoveldupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskMulEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMulEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMulPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMulPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMulRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskMulRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskMulSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskMulSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskMulloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskOrEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskOrEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPermutePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskPermutePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)
- func MaskPermutevarPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)
- func MaskPermutevarPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)
- func MaskPermutex2varEpi32(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPermutex2varEpi64(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPermutex2varPd(a x86.M128d, k x86.Mmask8, idx x86.M128i, b x86.M128d) (dst x86.M128d)
- func MaskPermutex2varPs(a x86.M128, k x86.Mmask8, idx x86.M128i, b x86.M128) (dst x86.M128)
- func MaskRcp14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskRcp14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskRcp14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskRcp14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskRolEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskRolEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskRolvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskRolvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskRorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskRorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskRorvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskRorvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskRoundscalePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskRoundscalePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)
- func MaskRoundscaleRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)
- func MaskRoundscaleRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)
- func MaskRoundscaleSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskRoundscaleSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
- func MaskRsqrt14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskRsqrt14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskRsqrt14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskRsqrt14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskScalefPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskScalefPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskScalefRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskScalefRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskScalefSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskScalefSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskSet1Epi32(src x86.M128i, k x86.Mmask8, a int) (dst x86.M128i)
- func MaskSet1Epi64(src x86.M128i, k x86.Mmask8, a int64) (dst x86.M128i)
- func MaskShuffleEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskShufflePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskShufflePs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
- func MaskSllEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSllEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSlliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSlliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSllvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSllvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSqrtPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskSqrtPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskSqrtRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskSqrtRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskSqrtSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskSqrtSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskSraEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSraEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSraiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSraiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSravEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSravEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrlEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrlEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSrliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSrlvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrlvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskStoreSd(mem_addr *float64, k x86.Mmask8, a x86.M128d)
- func MaskStoreSs(mem_addr *float32, k x86.Mmask8, a x86.M128)
- func MaskSubEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskSubPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskSubRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskSubRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskSubSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskSubSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskTernarylogicEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskTernarylogicEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskUnpackhiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackhiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackhiPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskUnpackhiPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskUnpackloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackloEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackloPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskUnpackloPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskXorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskXorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAbsEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzAbsEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzAddEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzAddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzAddSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzAddSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzAndEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAndEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAndnotEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAndnotEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzCompressEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCompressEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCompressPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzCompressPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)
- func MaskzCvtRoundsdSs(k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)
- func MaskzCvtRoundssSd(k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)
- func MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)
- func MaskzCvtepi32Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)
- func MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)
- func MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskzCvtpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskzCvtpdPs(k x86.Mmask8, a x86.M128d) (dst x86.M128)
- func MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M128)
- func MaskzCvtpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskzCvtpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskzCvtpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)
- func MaskzCvtsdSs(k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)
- func MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtssSd(k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)
- func MaskzCvttpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskzCvttpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)
- func MaskzCvttpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskzCvttpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)
- func MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzDivPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzDivPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzDivRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzDivRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzDivSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzDivSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzExpandEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzExpandEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzExpandPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzExpandPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzFixupimmPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func MaskzFixupimmPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func MaskzFixupimmRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
- func MaskzFixupimmRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)
- func MaskzFixupimmSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
- func MaskzFixupimmSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
- func MaskzFmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskzFmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskzFmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFmaddsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmaddsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskzFmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskzFmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFmsubaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFmsubaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFnmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFnmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFnmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskzFnmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskzFnmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFnmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFnmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFnmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzFnmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
- func MaskzFnmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
- func MaskzFnmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
- func MaskzFnmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
- func MaskzGetexpPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzGetexpPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzGetexpRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzGetexpRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzGetexpSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzGetexpSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzGetmantPd(k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
- func MaskzGetmantPs(k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
- func MaskzGetmantRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, ...) (dst x86.M128d)
- func MaskzGetmantRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, ...) (dst x86.M128)
- func MaskzGetmantSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, ...) (dst x86.M128d)
- func MaskzGetmantSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, ...) (dst x86.M128)
- func MaskzLoadSd(k x86.Mmask8, mem_addr *float64) (dst x86.M128d)
- func MaskzLoadSs(k x86.Mmask8, mem_addr *float32) (dst x86.M128)
- func MaskzMaxEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMaxPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMaxRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MaskzMaxRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MaskzMaxSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMaxSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMinEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMinPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMinRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MaskzMinRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MaskzMinSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMinSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMovEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzMovEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzMovPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzMovPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzMoveSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMoveSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMovedupPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzMovehdupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzMoveldupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzMulEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMulEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMulPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMulPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMulRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzMulRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzMulSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzMulSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzMulloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzOrEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzOrEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPermutePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskzPermutePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)
- func MaskzPermutevarPd(k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)
- func MaskzPermutevarPs(k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)
- func MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPermutex2varPd(k x86.Mmask8, a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)
- func MaskzPermutex2varPs(k x86.Mmask8, a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)
- func MaskzRcp14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzRcp14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzRcp14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzRcp14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzRolEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzRolEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzRolvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzRolvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzRorEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzRorEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzRorvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzRorvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzRoundscalePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskzRoundscalePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)
- func MaskzRoundscaleRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)
- func MaskzRoundscaleRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)
- func MaskzRoundscaleSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskzRoundscaleSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
- func MaskzRsqrt14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzRsqrt14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzRsqrt14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzRsqrt14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzScalefPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzScalefPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzScalefRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzScalefRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzScalefSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzScalefSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M128i)
- func MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M128i)
- func MaskzShuffleEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzShufflePd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
- func MaskzShufflePs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
- func MaskzSllEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSllEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSlliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSlliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSllvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSllvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSqrtPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
- func MaskzSqrtPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
- func MaskzSqrtRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzSqrtRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzSqrtSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzSqrtSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzSraEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSraEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSraiEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSraiEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSravEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSravEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrlEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrlEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSrliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSrlvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrlvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSubEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzSubPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzSubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MaskzSubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func MaskzSubSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzSubSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackhiPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzUnpackhiPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzUnpackloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackloEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackloPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
- func MaskzUnpackloPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
- func MaskzXorEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzXorEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaxEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaxEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaxRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MaxRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MinEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MinEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MinRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
- func MinRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)
- func MulRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func MulRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func Permutex2varEpi32(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func Permutex2varEpi64(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func Permutex2varPd(a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)
- func Permutex2varPs(a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)
- func Rcp14Pd(a x86.M128d) (dst x86.M128d)
- func Rcp14Ps(a x86.M128) (dst x86.M128)
- func Rcp14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)
- func Rcp14Ss(a x86.M128, b x86.M128) (dst x86.M128)
- func RolEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)
- func RolEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)
- func RolvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func RolvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func RorEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)
- func RorEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)
- func RorvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func RorvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
- func RoundscalePd(a x86.M128d, imm8 byte) (dst x86.M128d)
- func RoundscalePs(a x86.M128, imm8 byte) (dst x86.M128)
- func RoundscaleRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)
- func RoundscaleRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)
- func RoundscaleSd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
- func RoundscaleSs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
- func Rsqrt14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)
- func Rsqrt14Ss(a x86.M128, b x86.M128) (dst x86.M128)
- func ScalefPd(a x86.M128d, b x86.M128d) (dst x86.M128d)
- func ScalefPs(a x86.M128, b x86.M128) (dst x86.M128)
- func ScalefRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func ScalefRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func ScalefSd(a x86.M128d, b x86.M128d) (dst x86.M128d)
- func ScalefSs(a x86.M128, b x86.M128) (dst x86.M128)
- func SqrtRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func SqrtRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func SraEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)
- func SraiEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)
- func SravEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)
- func SubRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
- func SubRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)
- func TernarylogicEpi32(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
- func TernarylogicEpi64(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
- func TestEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func TestEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func TestnEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func TestnEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AbsEpi64 ¶
AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.
FOR j := 0 to 1
  i := j*64
  dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm_abs_epi64'. Requires AVX512F.
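Since these packages contain no working intrinsics, a scalar Go sketch can still illustrate the documented behavior. The helper below mirrors the pseudocode on a two-element array standing in for the lanes of an x86.M128i; absEpi64Ref is an illustrative name, not part of the package.

package main

import "fmt"

// absEpi64Ref mirrors the AbsEpi64 pseudocode on two 64-bit lanes.
func absEpi64Ref(a [2]int64) (dst [2]int64) {
    for j := 0; j < 2; j++ { // FOR j := 0 to 1
        v := a[j]
        if v < 0 {
            v = -v // two's-complement ABS; math.MinInt64 maps to itself
        }
        dst[j] = v
    }
    return dst
}

func main() {
    fmt.Println(absEpi64Ref([2]int64{-5, 7})) // [5 7]
}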
func AddRoundSd ¶
AddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VADDSD'. Intrinsic: '_mm_add_round_sd'. Requires AVX512F.
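A minimal scalar sketch of the lower-lane add with upper-lane passthrough, assuming plain float64 arithmetic (which always rounds to nearest, i.e. the _MM_FROUND_TO_NEAREST_INT case; the other modes are not expressible in plain Go):

// addRoundSdRef sketches AddRoundSd for the round-to-nearest case only.
func addRoundSdRef(a, b [2]float64) (dst [2]float64) {
    dst[0] = a[0] + b[0] // dst[63:0] := a[63:0] + b[63:0]
    dst[1] = a[1]        // dst[127:64] := a[127:64]
    return dst
}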
func AddRoundSs ¶
AddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VADDSS'. Intrinsic: '_mm_add_round_ss'. Requires AVX512F.
func CmpEpi32Mask ¶
CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmp_epi32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
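A scalar reference sketch of the predicate dispatch and mask construction; the cmpint* constants below mirror the _MM_CMPINT_* encoding and are illustrative, not exported by any package:

// Comparison predicates, following the _MM_CMPINT_* encoding (0..7).
const (
    cmpintEQ = iota
    cmpintLT
    cmpintLE
    cmpintFALSE
    cmpintNEQ
    cmpintNLT
    cmpintNLE
    cmpintTRUE
)

// cmpEpi32MaskRef builds an 8-bit mask from four signed 32-bit lanes.
func cmpEpi32MaskRef(a, b [4]int32, imm8 byte) (k uint8) {
    for j := 0; j < 4; j++ {
        var hit bool
        switch imm8 & 7 {
        case cmpintEQ:
            hit = a[j] == b[j]
        case cmpintLT:
            hit = a[j] < b[j]
        case cmpintLE:
            hit = a[j] <= b[j]
        case cmpintFALSE:
            hit = false
        case cmpintNEQ:
            hit = a[j] != b[j]
        case cmpintNLT:
            hit = a[j] >= b[j]
        case cmpintNLE:
            hit = a[j] > b[j]
        case cmpintTRUE:
            hit = true
        }
        if hit {
            k |= 1 << uint(j) // k[j] := 1
        }
    }
    return k // bits k[MAX:4] stay 0
}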
func CmpEpi64Mask ¶
CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpEpu32Mask ¶
CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmp_epu32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpEpu64Mask ¶
CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpPdMask ¶
CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VCMPPD'. Intrinsic: '_mm_cmp_pd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpPsMask ¶
CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VCMPPS'. Intrinsic: '_mm_cmp_ps_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpRoundSdMask ¶
CmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
k[MAX:1] := 0
Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_round_sd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpRoundSsMask ¶
CmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
k[MAX:1] := 0
Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_round_ss_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpSdMask ¶
CmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
k[MAX:1] := 0
Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_sd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpSsMask ¶
CmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
k[MAX:1] := 0
Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_ss_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CmpeqEpi32Mask ¶
CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpeq_epi32_mask'. Requires AVX512F.
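This is the imm8 = _MM_CMPINT_EQ special case of CmpEpi32Mask above; a minimal scalar sketch:

// cmpeqEpi32MaskRef sets mask bit j when lanes j of a and b are equal.
func cmpeqEpi32MaskRef(a, b [4]int32) (k uint8) {
    for j := 0; j < 4; j++ {
        if a[j] == b[j] {
            k |= 1 << uint(j)
        }
    }
    return k
}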
func CmpeqEpi64Mask ¶
CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpeq_epi64_mask'. Requires AVX512F.
func CmpeqEpu32Mask ¶
CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpeq_epu32_mask'. Requires AVX512F.
func CmpeqEpu64Mask ¶
CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpeq_epu64_mask'. Requires AVX512F.
func CmpgeEpi32Mask ¶
CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpge_epi32_mask'. Requires AVX512F.
func CmpgeEpi64Mask ¶
CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpge_epi64_mask'. Requires AVX512F.
func CmpgeEpu32Mask ¶
CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpge_epu32_mask'. Requires AVX512F.
func CmpgeEpu64Mask ¶
CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpge_epu64_mask'. Requires AVX512F.
func CmpgtEpi32Mask ¶
CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpgt_epi32_mask'. Requires AVX512F.
func CmpgtEpi64Mask ¶
CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpgt_epi64_mask'. Requires AVX512F.
func CmpgtEpu32Mask ¶
CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpgt_epu32_mask'. Requires AVX512F.
func CmpgtEpu64Mask ¶
CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpgt_epu64_mask'. Requires AVX512F.
func CmpleEpi32Mask ¶
CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmple_epi32_mask'. Requires AVX512F.
func CmpleEpi64Mask ¶
CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmple_epi64_mask'. Requires AVX512F.
func CmpleEpu32Mask ¶
CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmple_epu32_mask'. Requires AVX512F.
func CmpleEpu64Mask ¶
CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmple_epu64_mask'. Requires AVX512F.
func CmpltEpi32Mask ¶
CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmplt_epi32_mask'. Requires AVX512F.
func CmpltEpi64Mask ¶
CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmplt_epi64_mask'. Requires AVX512F.
func CmpltEpu32Mask ¶
CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmplt_epu32_mask'. Requires AVX512F.
func CmpltEpu64Mask ¶
CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmplt_epu64_mask'. Requires AVX512F.
func CmpneqEpi32Mask ¶
CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpneq_epi32_mask'. Requires AVX512F.
func CmpneqEpi64Mask ¶
CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpneq_epi64_mask'. Requires AVX512F.
func CmpneqEpu32Mask ¶
CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 3
  i := j*32
  k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpneq_epu32_mask'. Requires AVX512F.
func CmpneqEpu64Mask ¶
CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 1
  i := j*64
  k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpneq_epu64_mask'. Requires AVX512F.
func ComiRoundSd ¶
ComiRoundSd: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0
Instruction: 'VCOMISD'. Intrinsic: '_mm_comi_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
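Unlike the mask-producing comparisons, this returns a plain int. The ordered/unordered split in the imm8 table is the part that most often surprises; a scalar sketch of the two equality flavors (comiEqRef is an illustrative helper, not the intrinsic):

package main

import (
    "fmt"
    "math"
)

// comiEqRef: ordered equality (_CMP_EQ_OQ) is false when either input
// is NaN; unordered equality (_CMP_EQ_UQ) is true in that case.
func comiEqRef(a, b float64, ordered bool) int {
    if math.IsNaN(a) || math.IsNaN(b) {
        if ordered {
            return 0
        }
        return 1
    }
    if a == b {
        return 1
    }
    return 0
}

func main() {
    fmt.Println(comiEqRef(math.NaN(), 1, true))  // 0
    fmt.Println(comiEqRef(math.NaN(), 1, false)) // 1
}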
func ComiRoundSs ¶
ComiRoundSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0
Instruction: 'VCOMISS'. Intrinsic: '_mm_comi_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func CvtRoundi32Ss ¶
CvtRoundi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi32_ss'. Requires AVX512F.
func CvtRoundi64Sd ¶
CvtRoundi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundi64_sd'. Requires AVX512F.
func CvtRoundi64Ss ¶
CvtRoundi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi64_ss'. Requires AVX512F.
func CvtRoundsdI32 ¶
CvtRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_Int32(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i32'. Requires AVX512F.
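A scalar sketch of the rounding-mode dispatch using math package helpers as stand-ins; the mode constants are illustrative and do not have the real _MM_FROUND_* values:

package main

import (
    "fmt"
    "math"
)

const (
    toNearest = iota // _MM_FROUND_TO_NEAREST_INT
    toNegInf         // _MM_FROUND_TO_NEG_INF
    toPosInf         // _MM_FROUND_TO_POS_INF
    toZero           // _MM_FROUND_TO_ZERO
)

// cvtRoundsdI32Ref sketches FP64 -> int32 under each rounding mode.
func cvtRoundsdI32Ref(a float64, mode int) int32 {
    var r float64
    switch mode {
    case toNearest:
        r = math.RoundToEven(a) // round to nearest, ties to even
    case toNegInf:
        r = math.Floor(a) // round down
    case toPosInf:
        r = math.Ceil(a) // round up
    case toZero:
        r = math.Trunc(a) // truncate
    }
    return int32(r)
}

func main() {
    fmt.Println(cvtRoundsdI32Ref(2.5, toNearest)) // 2 (ties to even)
    fmt.Println(cvtRoundsdI32Ref(-2.5, toNegInf)) // -3
}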
func CvtRoundsdI64 ¶
CvtRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_Int64(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i64'. Requires AVX512F.
func CvtRoundsdSi32 ¶
CvtRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_Int32(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si32'. Requires AVX512F.
func CvtRoundsdSi64 ¶
CvtRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_Int64(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si64'. Requires AVX512F.
func CvtRoundsdSs ¶
CvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_cvt_roundsd_ss'. Requires AVX512F.
func CvtRoundsdU32 ¶
CvtRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u32'. Requires AVX512F.
func CvtRoundsdU64 ¶
CvtRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u64'. Requires AVX512F.
func CvtRoundsi32Ss ¶
CvtRoundsi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi32_ss'. Requires AVX512F.
func CvtRoundsi64Sd ¶
CvtRoundsi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundsi64_sd'. Requires AVX512F.
func CvtRoundsi64Ss ¶
CvtRoundsi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi64_ss'. Requires AVX512F.
func CvtRoundssI32 ¶
CvtRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i32'. Requires AVX512F.
func CvtRoundssI64 ¶
CvtRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP32_To_Int64(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i64'. Requires AVX512F.
func CvtRoundssSd ¶
CvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_cvt_roundss_sd'. Requires AVX512F.
func CvtRoundssSi32 ¶
CvtRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP32_To_Int32(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si32'. Requires AVX512F.
func CvtRoundssSi64 ¶
CvtRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP32_To_Int64(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si64'. Requires AVX512F.
func CvtRoundssU32 ¶
CvtRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u32'. Requires AVX512F.
func CvtRoundssU64 ¶
CvtRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u64'. Requires AVX512F.
func CvtRoundu32Ss ¶
CvtRoundu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu32_ss'. Requires AVX512F.
func CvtRoundu64Sd ¶
CvtRoundu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvt_roundu64_sd'. Requires AVX512F.
func CvtRoundu64Ss ¶
CvtRoundu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu64_ss'. Requires AVX512F.
func Cvtepi32Epi16 ¶
Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 3
  i := 32*j
  k := 16*j
  dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm_cvtepi32_epi16'. Requires AVX512F.
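Truncation here means keeping the low 16 bits of each lane, which is exactly Go's int32-to-int16 conversion (wrapping, not saturating); a minimal sketch:

// cvtepi32Epi16Ref keeps the low half of each 32-bit lane.
func cvtepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    for j := 0; j < 4; j++ {
        dst[j] = int16(a[j]) // Truncate_Int32_To_Int16
    }
    return dst
}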
func Cvtepi32Epi8 ¶
Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 3
  i := 32*j
  k := 8*j
  dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm_cvtepi32_epi8'. Requires AVX512F.
func Cvtepi64Epi16 ¶
Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 16*j
  dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm_cvtepi64_epi16'. Requires AVX512F.
func Cvtepi64Epi32 ¶
Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 32*j
  dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm_cvtepi64_epi32'. Requires AVX512F.
func Cvtepi64Epi8 ¶
Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 8*j
  dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm_cvtepi64_epi8'. Requires AVX512F.
func Cvtepu32Pd ¶
Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 1
  i := j*64
  l := j*32
  dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_cvtepu32_pd'. Requires AVX512F.
func Cvti32Sd ¶
Cvti32Sd: Convert the 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti32_sd'. Requires AVX512F.
func Cvti32Ss ¶
Cvti32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti32_ss'. Requires AVX512F.
func Cvti64Sd ¶
Cvti64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti64_sd'. Requires AVX512F.
func Cvti64Ss ¶
Cvti64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti64_ss'. Requires AVX512F.
func CvtpdEpu32 ¶
CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 1
  i := 32*j
  k := 64*j
  dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:64] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_cvtpd_epu32'. Requires AVX512F.
func CvtpsEpu32 ¶
CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 3
  i := 32*j
  dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_cvtps_epu32'. Requires AVX512F.
func CvtsdI32 ¶
CvtsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i32'. Requires AVX512F.
func CvtsdI64 ¶
CvtsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i64'. Requires AVX512F.
func CvtsdU32 ¶
CvtsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.
dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u32'. Requires AVX512F.
func CvtsdU64 ¶
CvtsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.
dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u64'. Requires AVX512F.
func Cvtsepi32Epi16 ¶
Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 3
  i := 32*j
  k := 16*j
  dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm_cvtsepi32_epi16'. Requires AVX512F.
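Saturation differs from the truncating Cvtepi32Epi16 above: out-of-range lanes clamp to the int16 bounds instead of wrapping. A scalar sketch:

// cvtsepi32Epi16Ref clamps each lane to [-32768, 32767] before narrowing.
func cvtsepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    for j := 0; j < 4; j++ {
        v := a[j]
        if v > 32767 { // math.MaxInt16
            v = 32767
        } else if v < -32768 { // math.MinInt16
            v = -32768
        }
        dst[j] = int16(v)
    }
    return dst
}

For example, cvtsepi32Epi16Ref([4]int32{100000, -100000, 1, 2}) yields [32767 -32768 1 2], where the truncating variant would wrap the first two lanes.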
func Cvtsepi32Epi8 ¶
Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 3
  i := 32*j
  k := 8*j
  dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm_cvtsepi32_epi8'. Requires AVX512F.
func Cvtsepi64Epi16 ¶
Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 16*j
  dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm_cvtsepi64_epi16'. Requires AVX512F.
func Cvtsepi64Epi32 ¶
Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 32*j
  dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm_cvtsepi64_epi32'. Requires AVX512F.
func Cvtsepi64Epi8 ¶
Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 1
  i := 64*j
  k := 8*j
  dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm_cvtsepi64_epi8'. Requires AVX512F.
func CvtssI32 ¶
CvtssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i32'. Requires AVX512F.
func CvtssI64 ¶
CvtssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i64'. Requires AVX512F.
func CvtssU32 ¶
CvtssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.
dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u32'. Requires AVX512F.
func CvtssU64 ¶
CvtssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.
dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u64'. Requires AVX512F.
func CvttRoundsdI32 ¶
CvttRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i32'. Requires AVX512F.
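The 'tt' variants always truncate toward zero, which also happens to be Go's native float-to-int conversion behavior; a one-line scalar sketch:

// cvttRoundsdI32Ref: Go's conversion discards the fraction, matching
// Convert_FP64_To_Int32_Truncate for in-range inputs.
func cvttRoundsdI32Ref(a float64) int32 {
    return int32(a) // 2.9 -> 2, -2.9 -> -2
}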
func CvttRoundsdI64 ¶
CvttRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i64'. Requires AVX512F.
func CvttRoundsdSi32 ¶
CvttRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si32'. Requires AVX512F.
func CvttRoundsdSi64 ¶
CvttRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si64'. Requires AVX512F.
func CvttRoundsdU32 ¶
CvttRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u32'. Requires AVX512F.
func CvttRoundsdU64 ¶
CvttRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u64'. Requires AVX512F.
func CvttRoundssI32 ¶
CvttRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i32'. Requires AVX512F.
func CvttRoundssI64 ¶
CvttRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i64'. Requires AVX512F.
func CvttRoundssSi32 ¶
CvttRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si32'. Requires AVX512F.
func CvttRoundssSi64 ¶
CvttRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si64'. Requires AVX512F.
func CvttRoundssU32 ¶
CvttRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u32'. Requires AVX512F.
func CvttRoundssU64 ¶
CvttRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u64'. Requires AVX512F.
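Since these packages contain no working intrinsics, the 'rounding' argument of the CvttRound* functions is easiest to read as a plain bit pattern. A minimal Go sketch, assuming hypothetical constants that mirror the _MM_FROUND_* values from Intel's xmmintrin.h (this package does not export them):

    // Hypothetical constants mirroring Intel's xmmintrin.h values.
    const (
        froundToNearestInt = 0x00
        froundToNegInf     = 0x01
        froundToPosInf     = 0x02
        froundToZero       = 0x03
        froundCurDirection = 0x04
        froundNoExc        = 0x08 // SAE: suppress all exceptions
    )

    // truncationRounding builds the argument shape used above:
    // truncate, and suppress exceptions.
    func truncationRounding() int {
        return froundToZero | froundNoExc
    }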
func CvttpdEpu32 ¶
CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 1
        i := 32*j
        k := 64*j
        dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_cvttpd_epu32'. Requires AVX512F.
func CvttpsEpu32 ¶
CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 32*j
        dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_cvttps_epu32'. Requires AVX512F.
func CvttsdI32 ¶
CvttsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i32'. Requires AVX512F.
func CvttsdI64 ¶
CvttsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i64'. Requires AVX512F.
func CvttsdU32 ¶
CvttsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.
dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u32'. Requires AVX512F.
func CvttsdU64 ¶
CvttsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.
dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u64'. Requires AVX512F.
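For the plain (non-rounding) truncating conversions above, the per-element operation maps directly onto Go's numeric conversions. A minimal sketch, with float64 and int32 standing in for the vector types:

    // cvttsdI32 sketches the scalar operation behind VCVTTSD2SI:
    // conversion with truncation, i.e. rounding toward zero. Go's
    // float64-to-int32 conversion already truncates in-range values;
    // out-of-range inputs are implementation-dependent in Go, whereas
    // the hardware returns the integer indefinite value instead.
    func cvttsdI32(a float64) int32 {
        return int32(a) // cvttsdI32(3.9) == 3, cvttsdI32(-3.9) == -3
    }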
func CvttssI32 ¶
CvttssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i32'. Requires AVX512F.
func CvttssI64 ¶
CvttssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i64'. Requires AVX512F.
func CvttssU32 ¶
CvttssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.
dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u32'. Requires AVX512F.
func CvttssU64 ¶
CvttssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.
dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u64'. Requires AVX512F.
func Cvtu32Sd ¶
Cvtu32Sd: Convert the unsigned 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
    dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu32_sd'. Requires AVX512F.
func Cvtu32Ss ¶
Cvtu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
    dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu32_ss'. Requires AVX512F.
func Cvtu64Sd ¶
Cvtu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
    dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu64_sd'. Requires AVX512F.
func Cvtu64Ss ¶
Cvtu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
    dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu64_ss'. Requires AVX512F.
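The four Cvtu* functions above share one shape: convert 'b' into the low element and pass the rest of 'a' through. A sketch with a [4]float32 array standing in for x86.M128 (the array model is an assumption for illustration):

    // cvtu64Ss sketches the semantics of _mm_cvtu64_ss: convert the
    // unsigned integer, place it in the low element, and copy the
    // upper three packed elements from 'a'.
    func cvtu64Ss(a [4]float32, b uint64) (dst [4]float32) {
        dst = a             // upper 3 packed elements come from 'a'
        dst[0] = float32(b) // lower element is the converted integer
        return dst
    }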
func Cvtusepi32Epi16 ¶
Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 32*j
        k := 16*j
        dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_cvtusepi32_epi16'. Requires AVX512F.
func Cvtusepi32Epi8 ¶
Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 32*j
        k := 8*j
        dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
    ENDFOR
    dst[MAX:32] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_cvtusepi32_epi8'. Requires AVX512F.
func Cvtusepi64Epi16 ¶
Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 1
        i := 64*j
        k := 16*j
        dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
    ENDFOR
    dst[MAX:32] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_cvtusepi64_epi16'. Requires AVX512F.
func Cvtusepi64Epi32 ¶
Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 1
        i := 64*j
        k := 32*j
        dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_cvtusepi64_epi32'. Requires AVX512F.
func Cvtusepi64Epi8 ¶
Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 1
        i := 64*j
        k := 8*j
        dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
    ENDFOR
    dst[MAX:16] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_cvtusepi64_epi8'. Requires AVX512F.
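The unsigned-saturation conversions above all reduce to one per-element clamp. A minimal Go sketch of the 32-to-16-bit case (helper name is ours, not the package's):

    import "math"

    // saturateU32ToU16 is the per-element operation behind VPMOVUSDW
    // (_mm_cvtusepi32_epi16): values above 0xFFFF clamp to 0xFFFF
    // instead of wrapping.
    func saturateU32ToU16(x uint32) uint16 {
        if x > math.MaxUint16 {
            return math.MaxUint16
        }
        return uint16(x)
    }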
func DivRoundSd ¶
DivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := a[63:0] / b[63:0]
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VDIVSD'. Intrinsic: '_mm_div_round_sd'. Requires AVX512F.
func DivRoundSs ¶
DivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[31:0] := a[31:0] / b[31:0]
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VDIVSS'. Intrinsic: '_mm_div_round_ss'. Requires AVX512F.
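A plain-Go sketch of the scalar-with-passthrough shape, using [2]float64 in place of x86.M128d; the 'rounding' argument has no portable Go equivalent, so this sketch ignores it and uses the current FPU rounding mode:

    // divRoundSd sketches _mm_div_round_sd: divide the low elements
    // and copy the upper element from 'a'.
    func divRoundSd(a, b [2]float64) (dst [2]float64) {
        dst[0] = a[0] / b[0]
        dst[1] = a[1]
        return dst
    }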
func FixupimmPd ¶
FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
        tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
        CASE(tsrc[63:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[63:0] := src1[63:0]
            1 : dest[63:0] := tsrc[63:0]
            2 : dest[63:0] := QNaN(tsrc[63:0])
            3 : dest[63:0] := QNAN_Indefinite
            4 : dest[63:0] := -INF
            5 : dest[63:0] := +INF
            6 : dest[63:0] := tsrc.sign? -INF : +INF
            7 : dest[63:0] := -0
            8 : dest[63:0] := +0
            9 : dest[63:0] := -1
            10: dest[63:0] := +1
            11: dest[63:0] := 1/2
            12: dest[63:0] := 90.0
            13: dest[63:0] := PI/2
            14: dest[63:0] := MAX_FLOAT
            15: dest[63:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[63:0]
    }

    FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func FixupimmPs ¶
FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
        tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
        CASE(tsrc[31:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[31:0] := src1[31:0]
            1 : dest[31:0] := tsrc[31:0]
            2 : dest[31:0] := QNaN(tsrc[31:0])
            3 : dest[31:0] := QNAN_Indefinite
            4 : dest[31:0] := -INF
            5 : dest[31:0] := +INF
            6 : dest[31:0] := tsrc.sign? -INF : +INF
            7 : dest[31:0] := -0
            8 : dest[31:0] := +0
            9 : dest[31:0] := -1
            10: dest[31:0] := +1
            11: dest[31:0] := 1/2
            12: dest[31:0] := 90.0
            13: dest[31:0] := PI/2
            14: dest[31:0] := MAX_FLOAT
            15: dest[31:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[31:0]
    }

    FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func FixupimmRoundSd ¶
func FixupimmRoundSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
FixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
        tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
        CASE(tsrc[63:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[63:0] := src1[63:0]
            1 : dest[63:0] := tsrc[63:0]
            2 : dest[63:0] := QNaN(tsrc[63:0])
            3 : dest[63:0] := QNAN_Indefinite
            4 : dest[63:0] := -INF
            5 : dest[63:0] := +INF
            6 : dest[63:0] := tsrc.sign? -INF : +INF
            7 : dest[63:0] := -0
            8 : dest[63:0] := +0
            9 : dest[63:0] := -1
            10: dest[63:0] := +1
            11: dest[63:0] := 1/2
            12: dest[63:0] := 90.0
            13: dest[63:0] := PI/2
            14: dest[63:0] := MAX_FLOAT
            15: dest[63:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[63:0]
    }

    dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func FixupimmRoundSs ¶
FixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
        tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
        CASE(tsrc[31:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[31:0] := src1[31:0]
            1 : dest[31:0] := tsrc[31:0]
            2 : dest[31:0] := QNaN(tsrc[31:0])
            3 : dest[31:0] := QNAN_Indefinite
            4 : dest[31:0] := -INF
            5 : dest[31:0] := +INF
            6 : dest[31:0] := tsrc.sign? -INF : +INF
            7 : dest[31:0] := -0
            8 : dest[31:0] := +0
            9 : dest[31:0] := -1
            10: dest[31:0] := +1
            11: dest[31:0] := 1/2
            12: dest[31:0] := 90.0
            13: dest[31:0] := PI/2
            14: dest[31:0] := MAX_FLOAT
            15: dest[31:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[31:0]
    }

    dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func FixupimmSd ¶
FixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
        tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
        CASE(tsrc[63:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[63:0] := src1[63:0]
            1 : dest[63:0] := tsrc[63:0]
            2 : dest[63:0] := QNaN(tsrc[63:0])
            3 : dest[63:0] := QNAN_Indefinite
            4 : dest[63:0] := -INF
            5 : dest[63:0] := +INF
            6 : dest[63:0] := tsrc.sign? -INF : +INF
            7 : dest[63:0] := -0
            8 : dest[63:0] := +0
            9 : dest[63:0] := -1
            10: dest[63:0] := +1
            11: dest[63:0] := 1/2
            12: dest[63:0] := 90.0
            13: dest[63:0] := PI/2
            14: dest[63:0] := MAX_FLOAT
            15: dest[63:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[63:0]
    }

    dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func FixupimmSs ¶
FixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
    enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 }

    FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
        tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
        CASE(tsrc[31:0] of TOKEN_TYPE)
            QNAN_TOKEN: j := 0
            SNAN_TOKEN: j := 1
            ZERO_VALUE_TOKEN: j := 2
            ONE_VALUE_TOKEN: j := 3
            NEG_INF_TOKEN: j := 4
            POS_INF_TOKEN: j := 5
            NEG_VALUE_TOKEN: j := 6
            POS_VALUE_TOKEN: j := 7
        ESAC
        token_response[3:0] := src3[3+4*j:4*j]
        CASE(token_response[3:0]) of
            0 : dest[31:0] := src1[31:0]
            1 : dest[31:0] := tsrc[31:0]
            2 : dest[31:0] := QNaN(tsrc[31:0])
            3 : dest[31:0] := QNAN_Indefinite
            4 : dest[31:0] := -INF
            5 : dest[31:0] := +INF
            6 : dest[31:0] := tsrc.sign? -INF : +INF
            7 : dest[31:0] := -0
            8 : dest[31:0] := +0
            9 : dest[31:0] := -1
            10: dest[31:0] := +1
            11: dest[31:0] := 1/2
            12: dest[31:0] := 90.0
            13: dest[31:0] := PI/2
            14: dest[31:0] := MAX_FLOAT
            15: dest[31:0] := -MAX_FLOAT
        ESAC
        CASE(tsrc[31:0] of TOKEN_TYPE)
            ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
            ZERO_VALUE_TOKEN: if imm8[1] then set #IE
            ONE_VALUE_TOKEN: if imm8[2] then set #ZE
            ONE_VALUE_TOKEN: if imm8[3] then set #IE
            SNAN_TOKEN: if imm8[4] then set #IE
            NEG_INF_TOKEN: if imm8[5] then set #IE
            NEG_VALUE_TOKEN: if imm8[6] then set #IE
            POS_INF_TOKEN: if imm8[7] then set #IE
        ESAC
        RETURN dest[31:0]
    }

    dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
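The fixupimm pseudocode above hinges on classifying each input into one of eight special-value tokens. A plain-Go sketch of that classification step (it cannot distinguish quiet from signaling NaNs, since Go does not expose that bit directly):

    import "math"

    // tokenType sketches the TOKEN_TYPE classification used by the
    // VFIXUPIMM* instructions, with the token numbering from the
    // pseudocode above.
    func tokenType(x float64) int {
        switch {
        case math.IsNaN(x):
            return 0 // QNAN_TOKEN (SNAN_TOKEN = 1 on real hardware)
        case x == 0:
            return 2 // ZERO_VALUE_TOKEN
        case x == 1:
            return 3 // ONE_VALUE_TOKEN
        case math.IsInf(x, -1):
            return 4 // NEG_INF_TOKEN
        case math.IsInf(x, 1):
            return 5 // POS_INF_TOKEN
        case x < 0:
            return 6 // NEG_VALUE_TOKEN
        default:
            return 7 // POS_VALUE_TOKEN
        }
    }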
func GetexpPd ¶
GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
    FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := ConvertExpFP64(a[i+63:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm_getexp_pd'. Requires AVX512F.
func GetexpPs ¶
GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
    FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := ConvertExpFP32(a[i+31:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm_getexp_ps'. Requires AVX512F.
func GetexpRoundSd ¶
GetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := ConvertExpFP64(b[63:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_round_sd'. Requires AVX512F.
func GetexpRoundSs ¶
GetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[31:0] := ConvertExpFP32(b[31:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_round_ss'. Requires AVX512F.
func GetexpSd ¶
GetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
    dst[63:0] := ConvertExpFP64(b[63:0])
    dst[127:64] := a[127:64]
    dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_sd'. Requires AVX512F.
func GetexpSs ¶
GetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
    dst[31:0] := ConvertExpFP32(b[31:0])
    dst[127:32] := a[127:32]
    dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_ss'. Requires AVX512F.
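The ConvertExpFP* operation used by all the getexp functions above has a close standard-library analogue. A sketch under that assumption:

    import "math"

    // getexp sketches the per-element operation behind VGETEXP*: for
    // normal, finite inputs math.Logb returns exactly
    // floor(log2(|x|)) as a float64, e.g. getexp(5.0) == 2 and
    // getexp(0.25) == -2. Zero, Inf and NaN follow math.Logb's
    // conventions, which may differ from the instruction's
    // special-case results.
    func getexp(x float64) float64 {
        return math.Logb(x)
    }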
func GetmantPd ¶
GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

    FOR j := 0 to 1
        i := j*64
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm_getmant_pd'. Requires AVX512F.
func GetmantPs ¶
GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

    FOR j := 0 to 3
        i := j*32
        dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm_getmant_ps'. Requires AVX512F.
func GetmantRoundSd ¶
func GetmantRoundSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)
GetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
    dst[127:64] := b[127:64]
    dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_round_sd'. Requires AVX512F.
func GetmantRoundSs ¶
func GetmantRoundSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)
GetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

    dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
    dst[127:32] := b[127:32]
    dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_round_ss'. Requires AVX512F.
func GetmantSd ¶
func GetmantSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
GetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

    dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
    dst[127:64] := b[127:64]
    dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_sd'. Requires AVX512F.
func GetmantSs ¶
func GetmantSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
GetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src  // sign = sign(src)
    _MM_MANT_SIGN_zero // sign = 0
    _MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

    dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
    dst[127:32] := b[127:32]
    dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_ss'. Requires AVX512F.
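One case of GetNormalizedMantissa is easy to reproduce with the standard library. A sketch for the _MM_MANT_NORM_1_2 interval with _MM_MANT_SIGN_src; zero, Inf and NaN follow math.Frexp's conventions rather than the instruction's:

    import "math"

    // getmant12 rescales x so its significand lies in [1, 2),
    // keeping the source sign, e.g. getmant12(12) == 1.5 since
    // 12 = 1.5 * 2^3.
    func getmant12(x float64) float64 {
        frac, _ := math.Frexp(x) // |frac| in [0.5, 1)
        return frac * 2          // |result| in [1, 2)
    }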
func M256AbsEpi64 ¶
M256AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.
    FOR j := 0 to 3
        i := j*64
        dst[i+63:i] := ABS(a[i+63:i])
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm256_abs_epi64'. Requires AVX512F.
func M256BroadcastF32x4 ¶
M256BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.
    FOR j := 0 to 7
        i := j*32
        n := (j mod 4)*32
        dst[i+31:i] := a[n+31:n]
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_broadcast_f32x4'. Requires AVX512F.
func M256BroadcastI32x4 ¶
M256BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.
    FOR j := 0 to 7
        i := j*32
        n := (j mod 4)*32
        dst[i+31:i] := a[n+31:n]
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_broadcast_i32x4'. Requires AVX512F.
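A plain-Go sketch of the broadcast shape, with fixed-size arrays standing in for the 128- and 256-bit vector types:

    // broadcastI32x4 sketches _mm256_broadcast_i32x4: the 4-lane
    // source is copied into both 128-bit halves of the destination.
    func broadcastI32x4(a [4]int32) (dst [8]int32) {
        copy(dst[0:4], a[:])
        copy(dst[4:8], a[:])
        return dst
    }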
func M256CmpEpi32Mask ¶
M256CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _MM_CMPINT_EQ
        1: OP := _MM_CMPINT_LT
        2: OP := _MM_CMPINT_LE
        3: OP := _MM_CMPINT_FALSE
        4: OP := _MM_CMPINT_NEQ
        5: OP := _MM_CMPINT_NLT
        6: OP := _MM_CMPINT_NLE
        7: OP := _MM_CMPINT_TRUE
    ESAC
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmp_epi32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256CmpEpi64Mask ¶
M256CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _MM_CMPINT_EQ
        1: OP := _MM_CMPINT_LT
        2: OP := _MM_CMPINT_LE
        3: OP := _MM_CMPINT_FALSE
        4: OP := _MM_CMPINT_NEQ
        5: OP := _MM_CMPINT_NLT
        6: OP := _MM_CMPINT_NLE
        7: OP := _MM_CMPINT_TRUE
    ESAC
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256CmpEpu32Mask ¶
M256CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _MM_CMPINT_EQ
        1: OP := _MM_CMPINT_LT
        2: OP := _MM_CMPINT_LE
        3: OP := _MM_CMPINT_FALSE
        4: OP := _MM_CMPINT_NEQ
        5: OP := _MM_CMPINT_NLT
        6: OP := _MM_CMPINT_NLE
        7: OP := _MM_CMPINT_TRUE
    ESAC
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmp_epu32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256CmpEpu64Mask ¶
M256CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _MM_CMPINT_EQ
        1: OP := _MM_CMPINT_LT
        2: OP := _MM_CMPINT_LE
        3: OP := _MM_CMPINT_FALSE
        4: OP := _MM_CMPINT_NEQ
        5: OP := _MM_CMPINT_NLT
        6: OP := _MM_CMPINT_NLE
        7: OP := _MM_CMPINT_TRUE
    ESAC
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256CmpPdMask ¶
M256CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _CMP_EQ_OQ
        1: OP := _CMP_LT_OS
        2: OP := _CMP_LE_OS
        3: OP := _CMP_UNORD_Q
        4: OP := _CMP_NEQ_UQ
        5: OP := _CMP_NLT_US
        6: OP := _CMP_NLE_US
        7: OP := _CMP_ORD_Q
        8: OP := _CMP_EQ_UQ
        9: OP := _CMP_NGE_US
        10: OP := _CMP_NGT_US
        11: OP := _CMP_FALSE_OQ
        12: OP := _CMP_NEQ_OQ
        13: OP := _CMP_GE_OS
        14: OP := _CMP_GT_OS
        15: OP := _CMP_TRUE_UQ
        16: OP := _CMP_EQ_OS
        17: OP := _CMP_LT_OQ
        18: OP := _CMP_LE_OQ
        19: OP := _CMP_UNORD_S
        20: OP := _CMP_NEQ_US
        21: OP := _CMP_NLT_UQ
        22: OP := _CMP_NLE_UQ
        23: OP := _CMP_ORD_S
        24: OP := _CMP_EQ_US
        25: OP := _CMP_NGE_UQ
        26: OP := _CMP_NGT_UQ
        27: OP := _CMP_FALSE_OS
        28: OP := _CMP_NEQ_OS
        29: OP := _CMP_GE_OQ
        30: OP := _CMP_GT_OQ
        31: OP := _CMP_TRUE_US
    ESAC
    FOR j := 0 to 3
        i := j*64
        k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VCMPPD'. Intrinsic: '_mm256_cmp_pd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256CmpPsMask ¶
M256CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
    CASE (imm8[7:0]) OF
        0: OP := _CMP_EQ_OQ
        1: OP := _CMP_LT_OS
        2: OP := _CMP_LE_OS
        3: OP := _CMP_UNORD_Q
        4: OP := _CMP_NEQ_UQ
        5: OP := _CMP_NLT_US
        6: OP := _CMP_NLE_US
        7: OP := _CMP_ORD_Q
        8: OP := _CMP_EQ_UQ
        9: OP := _CMP_NGE_US
        10: OP := _CMP_NGT_US
        11: OP := _CMP_FALSE_OQ
        12: OP := _CMP_NEQ_OQ
        13: OP := _CMP_GE_OS
        14: OP := _CMP_GT_OS
        15: OP := _CMP_TRUE_UQ
        16: OP := _CMP_EQ_OS
        17: OP := _CMP_LT_OQ
        18: OP := _CMP_LE_OQ
        19: OP := _CMP_UNORD_S
        20: OP := _CMP_NEQ_US
        21: OP := _CMP_NLT_UQ
        22: OP := _CMP_NLE_UQ
        23: OP := _CMP_ORD_S
        24: OP := _CMP_EQ_US
        25: OP := _CMP_NGE_UQ
        26: OP := _CMP_NGT_UQ
        27: OP := _CMP_FALSE_OS
        28: OP := _CMP_NEQ_OS
        29: OP := _CMP_GE_OQ
        30: OP := _CMP_GT_OQ
        31: OP := _CMP_TRUE_US
    ESAC
    FOR j := 0 to 7
        i := j*32
        k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VCMPPS'. Intrinsic: '_mm256_cmp_ps_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
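All of the compare-mask functions above and below share one shape: evaluate a predicate per lane and pack the results into the low bits of a mask. A Go sketch for one fixed predicate (_MM_CMPINT_LT), with an [8]int32 array standing in for x86.M256i and uint8 for x86.Mmask8:

    // cmpLtEpi32Mask builds an 8-bit mask where bit j is set iff
    // lane j of 'a' is less than lane j of 'b'.
    func cmpLtEpi32Mask(a, b [8]int32) (k uint8) {
        for j := range a {
            if a[j] < b[j] {
                k |= 1 << uint(j)
            }
        }
        return k
    }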
func M256CmpeqEpi32Mask ¶
M256CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpeq_epi32_mask'. Requires AVX512F.
func M256CmpeqEpi64Mask ¶
M256CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpeq_epi64_mask'. Requires AVX512F.
func M256CmpeqEpu32Mask ¶
M256CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpeq_epu32_mask'. Requires AVX512F.
func M256CmpeqEpu64Mask ¶
M256CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpeq_epu64_mask'. Requires AVX512F.
func M256CmpgeEpi32Mask ¶
M256CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpge_epi32_mask'. Requires AVX512F.
func M256CmpgeEpi64Mask ¶
M256CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpge_epi64_mask'. Requires AVX512F.
func M256CmpgeEpu32Mask ¶
M256CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpge_epu32_mask'. Requires AVX512F.
func M256CmpgeEpu64Mask ¶
M256CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpge_epu64_mask'. Requires AVX512F.
func M256CmpgtEpi32Mask ¶
M256CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpgt_epi32_mask'. Requires AVX512F.
func M256CmpgtEpi64Mask ¶
M256CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpgt_epi64_mask'. Requires AVX512F.
func M256CmpgtEpu32Mask ¶
M256CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpgt_epu32_mask'. Requires AVX512F.
func M256CmpgtEpu64Mask ¶
M256CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpgt_epu64_mask'. Requires AVX512F.
func M256CmpleEpi32Mask ¶
M256CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmple_epi32_mask'. Requires AVX512F.
func M256CmpleEpi64Mask ¶
M256CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmple_epi64_mask'. Requires AVX512F.
func M256CmpleEpu32Mask ¶
M256CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmple_epu32_mask'. Requires AVX512F.
func M256CmpleEpu64Mask ¶
M256CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmple_epu64_mask'. Requires AVX512F.
func M256CmpltEpi32Mask ¶
M256CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmplt_epi32_mask'. Requires AVX512F.
func M256CmpltEpi64Mask ¶
M256CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmplt_epi64_mask'. Requires AVX512F.
func M256CmpltEpu32Mask ¶
M256CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmplt_epu32_mask'. Requires AVX512F.
func M256CmpltEpu64Mask ¶
M256CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmplt_epu64_mask'. Requires AVX512F.
func M256CmpneqEpi32Mask ¶
M256CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpneq_epi32_mask'. Requires AVX512F.
func M256CmpneqEpi64Mask ¶
M256CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpneq_epi64_mask'. Requires AVX512F.
func M256CmpneqEpu32Mask ¶
M256CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 7
        i := j*32
        k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
    ENDFOR
    k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpneq_epu32_mask'. Requires AVX512F.
func M256CmpneqEpu64Mask ¶
M256CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
    FOR j := 0 to 3
        i := j*64
        k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
    ENDFOR
    k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpneq_epu64_mask'. Requires AVX512F.
func M256Cvtepi32Epi16 ¶
M256Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 16*j
        dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm256_cvtepi32_epi16'. Requires AVX512F.
func M256Cvtepi32Epi8 ¶
M256Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 8*j
        dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm256_cvtepi32_epi8'. Requires AVX512F.
func M256Cvtepi64Epi16 ¶
M256Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 16*j
        dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm256_cvtepi64_epi16'. Requires AVX512F.
func M256Cvtepi64Epi32 ¶
M256Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 32*j
        dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm256_cvtepi64_epi32'. Requires AVX512F.
func M256Cvtepi64Epi8 ¶
M256Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 8*j
        dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
    ENDFOR
    dst[MAX:32] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm256_cvtepi64_epi8'. Requires AVX512F.
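The truncating narrowing conversions above simply keep the low bits of each lane. A Go sketch of the 32-to-16-bit case, with arrays standing in for the vector types:

    // cvtepi32Epi16 sketches _mm256_cvtepi32_epi16: each 32-bit lane
    // is narrowed by truncation, i.e. only its low 16 bits survive.
    func cvtepi32Epi16(a [8]int32) (dst [8]int16) {
        for j, v := range a {
            dst[j] = int16(v) // Go's integer conversion keeps the low bits
        }
        return dst
    }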
func M256Cvtepu32Pd ¶
M256Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
    FOR j := 0 to 3
        i := j*64
        l := j*32
        dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_cvtepu32_pd'. Requires AVX512F.
func M256CvtpdEpu32 ¶
M256CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 32*j
        k := 64*j
        dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_cvtpd_epu32'. Requires AVX512F.
func M256CvtpsEpu32 ¶
M256CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_cvtps_epu32'. Requires AVX512F.
func M256Cvtsepi32Epi16 ¶
M256Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 16*j
        dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_cvtsepi32_epi16'. Requires AVX512F.
func M256Cvtsepi32Epi8 ¶
M256Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 8*j
        dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_cvtsepi32_epi8'. Requires AVX512F.
func M256Cvtsepi64Epi16 ¶
M256Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 16*j
        dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_cvtsepi64_epi16'. Requires AVX512F.
func M256Cvtsepi64Epi32 ¶
M256Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 32*j
        dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_cvtsepi64_epi32'. Requires AVX512F.
func M256Cvtsepi64Epi8 ¶
M256Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 8*j
        dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
    ENDFOR
    dst[MAX:32] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_cvtsepi64_epi8'. Requires AVX512F.
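The signed-saturation conversions above differ from the truncating ones only in the per-element clamp. A Go sketch of the 32-to-16-bit case (helper name is ours):

    import "math"

    // saturateI32ToI16 is the per-element operation behind VPMOVSDW
    // (_mm256_cvtsepi32_epi16): out-of-range values clamp to the
    // int16 extremes instead of wrapping.
    func saturateI32ToI16(x int32) int16 {
        switch {
        case x > math.MaxInt16:
            return math.MaxInt16
        case x < math.MinInt16:
            return math.MinInt16
        default:
            return int16(x)
        }
    }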
func M256CvttpdEpu32 ¶
M256CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 32*j
        k := 64*j
        dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_cvttpd_epu32'. Requires AVX512F.
func M256CvttpsEpu32 ¶
M256CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
    ENDFOR
    dst[MAX:256] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_cvttps_epu32'. Requires AVX512F.
func M256Cvtusepi32Epi16 ¶
M256Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 16*j
        dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
    ENDFOR
    dst[MAX:128] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_cvtusepi32_epi16'. Requires AVX512F.
func M256Cvtusepi32Epi8 ¶
M256Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 7
        i := 32*j
        k := 8*j
        dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_cvtusepi32_epi8'. Requires AVX512F.
func M256Cvtusepi64Epi16 ¶
M256Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
    FOR j := 0 to 3
        i := 64*j
        k := 16*j
        dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
    ENDFOR
    dst[MAX:64] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_cvtusepi64_epi16'. Requires AVX512F.
func M256Cvtusepi64Epi32 ¶
M256Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 3 i := 64*j k := 32*j dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_cvtusepi64_epi32'. Requires AVX512F.
func M256Cvtusepi64Epi8 ¶
M256Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 3 i := 64*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_cvtusepi64_epi8'. Requires AVX512F.
func M256Extractf32x4Ps ¶
M256Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256Extracti32x4Epi32 ¶
M256Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] ESAC dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256FixupimmPd ¶
M256FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0: dest[63:0] := src1[63:0]
        1: dest[63:0] := tsrc[63:0]
        2: dest[63:0] := QNaN(tsrc[63:0])
        3: dest[63:0] := QNAN_Indefinite
        4: dest[63:0] := -INF
        5: dest[63:0] := +INF
        6: dest[63:0] := tsrc.sign ? -INF : +INF
        7: dest[63:0] := -0
        8: dest[63:0] := +0
        9: dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 3
    i := j*64
    dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256FixupimmPs ¶
M256FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0: dest[31:0] := src1[31:0]
        1: dest[31:0] := tsrc[31:0]
        2: dest[31:0] := QNaN(tsrc[31:0])
        3: dest[31:0] := QNAN_Indefinite
        4: dest[31:0] := -INF
        5: dest[31:0] := +INF
        6: dest[31:0] := tsrc.sign ? -INF : +INF
        7: dest[31:0] := -0
        8: dest[31:0] := +0
        9: dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
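The 'c' operand is a per-token lookup table: each input is classified into one of the eight TOKEN_TYPE classes, and the 4-bit field of 'c' at offset 4*j selects the response for class j. A sketch of how such a table could be packed in Go (constant and function names are ours, not part of the package):

    // Token classes, in the order TOKEN_TYPE assigns them.
    const (
        qnanToken = iota
        snanToken
        zeroValueToken
        oneValueToken
        negInfToken
        posInfToken
        negValueToken
        posValueToken
    )

    // packFixupTable packs one 4-bit response code per token class,
    // mirroring token_response[3:0] := src3[3+4*j:4*j] above.
    func packFixupTable(responses [8]uint8) uint32 {
        var t uint32
        for j, r := range responses {
            t |= uint32(r&0xF) << (4 * j)
        }
        return t
    }

For example, setting responses[zeroValueToken] = 8 would make zero inputs fix up to +0, per the token_response table above.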
func M256GetexpPd ¶
M256GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 3 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_getexp_pd'. Requires AVX512F.
func M256GetexpPs ¶
M256GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_getexp_ps'. Requires AVX512F.
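For normal inputs this is exactly the binary exponent, so math.Logb gives a serviceable plain-Go model; special cases (zero, infinities, NaN, denormals) follow Go's Logb semantics here, which may not match VGETEXPPD bit for bit:

    import "math"

    // getexpModel mirrors ConvertExpFP64 lane by lane: the unbiased
    // exponent of each element, returned as a float64.
    func getexpModel(a [4]float64) (dst [4]float64) {
        for j, v := range a {
            dst[j] = math.Logb(v) // floor(log2(|v|)) for finite v != 0
        }
        return
    }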
func M256GetmantPd ¶
M256GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by 'sc', which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
FOR j := 0 to 3 i := j*64 dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ENDFOR dst[MAX:256] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_getmant_pd'. Requires AVX512F.
func M256GetmantPs ¶
M256GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by 'sc', which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
FOR j := 0 to 7 i := j*32 dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ENDFOR dst[MAX:256] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_getmant_ps'. Requires AVX512F.
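A plain-Go sketch of the common _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src combination, using math.Frexp to peel off the exponent; the other interv and sc values vary only the target interval and the sign rule:

    import "math"

    // getmantNorm12 scales |x| into [1, 2) and keeps the source sign,
    // like GetNormalizedMantissa with interv = _MM_MANT_NORM_1_2 and
    // sc = _MM_MANT_SIGN_src.
    func getmantNorm12(x float64) float64 {
        if x == 0 || math.IsInf(x, 0) || math.IsNaN(x) {
            return x // hardware handles specials via its own table
        }
        frac, _ := math.Frexp(math.Abs(x)) // frac is in [0.5, 1)
        return math.Copysign(2*frac, x)    // 2*frac is in [1, 2)
    }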
func M256Insertf32x4 ¶
M256Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.
dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256Inserti32x4 ¶
M256Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.
dst[255:0] := a[255:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] ESAC dst[MAX:256] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
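Both the extract and insert intrinsics move whole 128-bit lanes. Modeling the 256-bit vector as [8]float32 makes the imm8 selection explicit (a sketch, not the package API):

    // extractF32x4 models VEXTRACTF32X4: return the 128-bit lane
    // (four float32 elements) selected by imm8's low bit.
    func extractF32x4(a [8]float32, imm8 uint8) (dst [4]float32) {
        copy(dst[:], a[4*(imm8&1):])
        return
    }

    // insertF32x4 models VINSERTF32X4: copy a, then overwrite the
    // selected lane with b.
    func insertF32x4(a [8]float32, b [4]float32, imm8 uint8) [8]float32 {
        copy(a[4*(imm8&1):], b[:])
        return a
    }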
func M256Mask2Permutex2varEpi32 ¶
func M256Mask2Permutex2varEpi32(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)
M256Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2D'. Intrinsic: '_mm256_mask2_permutex2var_epi32'. Requires AVX512F.
func M256Mask2Permutex2varEpi64 ¶
func M256Mask2Permutex2varEpi64(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)
M256Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2Q'. Intrinsic: '_mm256_mask2_permutex2var_epi64'. Requires AVX512F.
func M256Mask2Permutex2varPd ¶
M256Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PD'. Intrinsic: '_mm256_mask2_permutex2var_pd'. Requires AVX512F.
func M256Mask2Permutex2varPs ¶
M256Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PS'. Intrinsic: '_mm256_mask2_permutex2var_ps'. Requires AVX512F.
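All four permutex2var variants share one selection rule: the low bits of each idx element pick a lane, the next bit picks between 'a' and 'b', and in the mask2 form a masked-off lane keeps its idx element. A sketch for the 32-bit case:

    // permutex2varEpi32Model mirrors the pseudocode above: idx bits
    // 2:0 select a lane, bit 3 selects the source vector, and lanes
    // whose mask bit is clear pass the idx element through.
    func permutex2varEpi32Model(a, idx [8]uint32, k uint8, b [8]uint32) (dst [8]uint32) {
        for j := 0; j < 8; j++ {
            if (k>>j)&1 == 0 {
                dst[j] = idx[j]
                continue
            }
            off := idx[j] & 7
            if idx[j]&8 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return
    }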
func M256Mask3FmaddPd ¶
M256Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask3_fmadd_pd'. Requires AVX512F.
func M256Mask3FmaddPs ¶
M256Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask3_fmadd_ps'. Requires AVX512F.
func M256Mask3FmaddsubPd ¶
M256Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask3_fmaddsub_pd'. Requires AVX512F.
func M256Mask3FmaddsubPs ¶
M256Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask3_fmaddsub_ps'. Requires AVX512F.
func M256Mask3FmsubPd ¶
M256Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask3_fmsub_pd'. Requires AVX512F.
func M256Mask3FmsubPs ¶
M256Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask3_fmsub_ps'. Requires AVX512F.
func M256Mask3FmsubaddPd ¶
M256Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask3_fmsubadd_pd'. Requires AVX512F.
func M256Mask3FmsubaddPs ¶
M256Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask3_fmsubadd_ps'. Requires AVX512F.
func M256Mask3FnmaddPd ¶
M256Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask3_fnmadd_pd'. Requires AVX512F.
func M256Mask3FnmaddPs ¶
M256Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask3_fnmadd_ps'. Requires AVX512F.
func M256Mask3FnmsubPd ¶
M256Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask3_fnmsub_pd'. Requires AVX512F.
func M256Mask3FnmsubPs ¶
M256Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask3_fnmsub_ps'. Requires AVX512F.
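The six mask3 FMA variants differ only in sign placement and in the add/sub alternation; the masking pattern is identical, with 'c' doubling as the pass-through source. A sketch of the fmadd_pd case using math.FMA, which rounds once like the hardware instruction:

    import "math"

    // mask3FmaddPdModel: a*b + c where the mask bit is set, the
    // corresponding element of c otherwise.
    func mask3FmaddPdModel(a, b, c [4]float64, k uint8) (dst [4]float64) {
        for j := 0; j < 4; j++ {
            if (k>>j)&1 == 1 {
                dst[j] = math.FMA(a[j], b[j], c[j])
            } else {
                dst[j] = c[j]
            }
        }
        return
    }

For the fmaddsub/fmsubadd variants, flip the sign of c[j] when j is even (fmaddsub) or odd (fmsubadd), as the pseudocode above spells out.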
func M256MaskAbsEpi32 ¶
M256MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm256_mask_abs_epi32'. Requires AVX512F.
func M256MaskAbsEpi64 ¶
M256MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm256_mask_abs_epi64'. Requires AVX512F.
func M256MaskAddEpi32 ¶
M256MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDD'. Intrinsic: '_mm256_mask_add_epi32'. Requires AVX512F.
func M256MaskAddEpi64 ¶
M256MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm256_mask_add_epi64'. Requires AVX512F.
func M256MaskAndEpi32 ¶
M256MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] BITWISE AND b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDD'. Intrinsic: '_mm256_mask_and_epi32'. Requires AVX512F.
func M256MaskAndEpi64 ¶
M256MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDQ'. Intrinsic: '_mm256_mask_and_epi64'. Requires AVX512F.
func M256MaskAndnotEpi32 ¶
M256MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDND'. Intrinsic: '_mm256_mask_andnot_epi32'. Requires AVX512F.
func M256MaskAndnotEpi64 ¶
M256MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDNQ'. Intrinsic: '_mm256_mask_andnot_epi64'. Requires AVX512F.
func M256MaskBlendEpi32 ¶
M256MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBLENDMD'. Intrinsic: '_mm256_mask_blend_epi32'. Requires AVX512F.
func M256MaskBlendEpi64 ¶
M256MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBLENDMQ'. Intrinsic: '_mm256_mask_blend_epi64'. Requires AVX512F.
func M256MaskBlendPd ¶
M256MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := b[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBLENDMPD'. Intrinsic: '_mm256_mask_blend_pd'. Requires AVX512F.
func M256MaskBlendPs ¶
M256MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := b[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBLENDMPS'. Intrinsic: '_mm256_mask_blend_ps'. Requires AVX512F.
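Blend is the simplest masked form: no pass-through operand, just a per-lane select. A plain-Go model:

    // blendEpi32Model mirrors VPBLENDMD: take the element from b
    // where the mask bit is set, from a where it is clear.
    func blendEpi32Model(k uint8, a, b [8]int32) (dst [8]int32) {
        for j := 0; j < 8; j++ {
            if (k>>j)&1 == 1 {
                dst[j] = b[j]
            } else {
                dst[j] = a[j]
            }
        }
        return
    }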
func M256MaskBroadcastF32x4 ¶
M256MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_mask_broadcast_f32x4'. Requires AVX512F.
func M256MaskBroadcastI32x4 ¶
M256MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_mask_broadcast_i32x4'. Requires AVX512F.
func M256MaskBroadcastdEpi32 ¶
M256MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_broadcastd_epi32'. Requires AVX512F.
func M256MaskBroadcastqEpi64 ¶
M256MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_broadcastq_epi64'. Requires AVX512F.
func M256MaskBroadcastsdPd ¶
M256MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_mask_broadcastsd_pd'. Requires AVX512F.
func M256MaskBroadcastssPs ¶
M256MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_mask_broadcastss_ps'. Requires AVX512F.
func M256MaskCmpEpi32Mask ¶
M256MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmp_epi32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpi64Mask ¶
M256MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpu32Mask ¶
M256MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmp_epu32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpu64Mask ¶
M256MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpPdMask ¶
M256MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VCMPPD'. Intrinsic: '_mm256_mask_cmp_pd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpPsMask ¶
M256MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VCMPPS'. Intrinsic: '_mm256_mask_cmp_ps_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
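Every compare in this family evaluates one predicate per lane under the zeromask and packs the outcomes into a bitmask. A sketch of the integer form, with the eight _MM_CMPINT encodings from the pseudocode above:

    // cmpEpi32MaskModel mirrors the VPCMPD pseudocode: bit j of the
    // result is the predicate outcome for lane j, or 0 if k1[j] is 0.
    func cmpEpi32MaskModel(k1 uint8, a, b [8]int32, imm8 uint8) (k uint8) {
        pred := func(x, y int32) bool {
            switch imm8 & 7 {
            case 0:
                return x == y // _MM_CMPINT_EQ
            case 1:
                return x < y // _MM_CMPINT_LT
            case 2:
                return x <= y // _MM_CMPINT_LE
            case 3:
                return false // _MM_CMPINT_FALSE
            case 4:
                return x != y // _MM_CMPINT_NEQ
            case 5:
                return x >= y // _MM_CMPINT_NLT
            case 6:
                return x > y // _MM_CMPINT_NLE
            default:
                return true // _MM_CMPINT_TRUE
            }
        }
        for j := 0; j < 8; j++ {
            if (k1>>j)&1 == 1 && pred(a[j], b[j]) {
                k |= 1 << j
            }
        }
        return
    }

The named variants that follow (cmpeq, cmpge, and so on) are fixed choices of the same predicate.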
func M256MaskCmpeqEpi32Mask ¶
M256MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpeq_epi32_mask'. Requires AVX512F.
func M256MaskCmpeqEpi64Mask ¶
M256MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpeq_epi64_mask'. Requires AVX512F.
func M256MaskCmpeqEpu32Mask ¶
M256MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpeq_epu32_mask'. Requires AVX512F.
func M256MaskCmpeqEpu64Mask ¶
M256MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpeq_epu64_mask'. Requires AVX512F.
func M256MaskCmpgeEpi32Mask ¶
M256MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpge_epi32_mask'. Requires AVX512F.
func M256MaskCmpgeEpi64Mask ¶
M256MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpge_epi64_mask'. Requires AVX512F.
func M256MaskCmpgeEpu32Mask ¶
M256MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpge_epu32_mask'. Requires AVX512F.
func M256MaskCmpgeEpu64Mask ¶
M256MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpge_epu64_mask'. Requires AVX512F.
func M256MaskCmpgtEpi32Mask ¶
M256MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpgt_epi32_mask'. Requires AVX512F.
func M256MaskCmpgtEpi64Mask ¶
M256MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpgt_epi64_mask'. Requires AVX512F.
func M256MaskCmpgtEpu32Mask ¶
M256MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpgt_epu32_mask'. Requires AVX512F.
func M256MaskCmpgtEpu64Mask ¶
M256MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpgt_epu64_mask'. Requires AVX512F.
func M256MaskCmpleEpi32Mask ¶
M256MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmple_epi32_mask'. Requires AVX512F.
func M256MaskCmpleEpi64Mask ¶
M256MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmple_epi64_mask'. Requires AVX512F.
func M256MaskCmpleEpu32Mask ¶
M256MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmple_epu32_mask'. Requires AVX512F.
func M256MaskCmpleEpu64Mask ¶
M256MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmple_epu64_mask'. Requires AVX512F.
func M256MaskCmpltEpi32Mask ¶
M256MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmplt_epi32_mask'. Requires AVX512F.
func M256MaskCmpltEpi64Mask ¶
M256MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmplt_epi64_mask'. Requires AVX512F.
func M256MaskCmpltEpu32Mask ¶
M256MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmplt_epu32_mask'. Requires AVX512F.
func M256MaskCmpltEpu64Mask ¶
M256MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmplt_epu64_mask'. Requires AVX512F.
func M256MaskCmpneqEpi32Mask ¶
M256MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpneq_epi32_mask'. Requires AVX512F.
func M256MaskCmpneqEpi64Mask ¶
M256MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpneq_epi64_mask'. Requires AVX512F.
func M256MaskCmpneqEpu32Mask ¶
M256MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpneq_epu32_mask'. Requires AVX512F.
func M256MaskCmpneqEpu64Mask ¶
M256MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpneq_epu64_mask'. Requires AVX512F.
func M256MaskCompressEpi32 ¶
M256MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_mask_compress_epi32'. Requires AVX512F.
func M256MaskCompressEpi64 ¶
M256MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_mask_compress_epi64'. Requires AVX512F.
func M256MaskCompressPd ¶
M256MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_mask_compress_pd'. Requires AVX512F.
func M256MaskCompressPs ¶
M256MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := src[255:m] dst[MAX:256] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_mask_compress_ps'. Requires AVX512F.
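Compress is the one operation here whose output position depends on the mask: active lanes pack down to the low end and the tail passes through from 'src'. A plain-Go model:

    // compressEpi32Model mirrors VPCOMPRESSD: pack the elements whose
    // mask bit is set toward index 0, then fill the rest from src.
    func compressEpi32Model(src [8]int32, k uint8, a [8]int32) (dst [8]int32) {
        m := 0
        for j := 0; j < 8; j++ {
            if (k>>j)&1 == 1 {
                dst[m] = a[j]
                m++
            }
        }
        for ; m < 8; m++ {
            dst[m] = src[m]
        }
        return
    }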
func M256MaskCvtRoundpsPh ¶
M256MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvt_roundps_ph'. Requires AVX512F.
func M256MaskCvtepi16Epi32 ¶
M256MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_mask_cvtepi16_epi32'. Requires AVX512F.
func M256MaskCvtepi16Epi64 ¶
M256MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_mask_cvtepi16_epi64'. Requires AVX512F.
func M256MaskCvtepi32Epi16 ¶
M256MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm256_mask_cvtepi32_epi16'. Requires AVX512F.
func M256MaskCvtepi32Epi64 ¶
M256MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_mask_cvtepi32_epi64'. Requires AVX512F.
func M256MaskCvtepi32Epi8 ¶
M256MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm256_mask_cvtepi32_epi8'. Requires AVX512F.
func M256MaskCvtepi32Pd ¶
M256MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_mask_cvtepi32_pd'. Requires AVX512F.
func M256MaskCvtepi32Ps ¶
M256MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_mask_cvtepi32_ps'. Requires AVX512F.
func M256MaskCvtepi64Epi16 ¶
M256MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm256_mask_cvtepi64_epi16'. Requires AVX512F.
func M256MaskCvtepi64Epi32 ¶
M256MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm256_mask_cvtepi64_epi32'. Requires AVX512F.
func M256MaskCvtepi64Epi8 ¶
M256MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm256_mask_cvtepi64_epi8'. Requires AVX512F.
func M256MaskCvtepi8Epi32 ¶
M256MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_mask_cvtepi8_epi32'. Requires AVX512F.
func M256MaskCvtepi8Epi64 ¶
M256MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_mask_cvtepi8_epi64'. Requires AVX512F.
func M256MaskCvtepu16Epi32 ¶
M256MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_mask_cvtepu16_epi32'. Requires AVX512F.
func M256MaskCvtepu16Epi64 ¶
M256MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_mask_cvtepu16_epi64'. Requires AVX512F.
func M256MaskCvtepu32Epi64 ¶
M256MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_mask_cvtepu32_epi64'. Requires AVX512F.
func M256MaskCvtepu32Pd ¶
M256MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_mask_cvtepu32_pd'. Requires AVX512F.
func M256MaskCvtepu8Epi32 ¶
M256MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_mask_cvtepu8_epi32'. Requires AVX512F.
func M256MaskCvtepu8Epi64 ¶
M256MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_mask_cvtepu8_epi64'. Requires AVX512F.
func M256MaskCvtpdEpi32 ¶
M256MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_mask_cvtpd_epi32'. Requires AVX512F.
func M256MaskCvtpdEpu32 ¶
M256MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_mask_cvtpd_epu32'. Requires AVX512F.
func M256MaskCvtpdPs ¶
M256MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_mask_cvtpd_ps'. Requires AVX512F.
func M256MaskCvtphPs ¶
M256MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_mask_cvtph_ps'. Requires AVX512F.
func M256MaskCvtpsEpi32 ¶
M256MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_mask_cvtps_epi32'. Requires AVX512F.
func M256MaskCvtpsEpu32 ¶
M256MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_mask_cvtps_epu32'. Requires AVX512F.
func M256MaskCvtpsPh ¶
M256MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvtps_ph'. Requires AVX512F.
func M256MaskCvtsepi32Epi16 ¶
M256MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_mask_cvtsepi32_epi16'. Requires AVX512F.
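Signed saturation is the only non-obvious step here. A scalar Go model, assuming the standard math package constants (helper names are hypothetical):

    // saturateInt32ToInt16 clamps x into the int16 range.
    func saturateInt32ToInt16(x int32) int16 {
        if x > math.MaxInt16 {
            return math.MaxInt16
        }
        if x < math.MinInt16 {
            return math.MinInt16
        }
        return int16(x)
    }

    // maskCvtsepi32Epi16 models the masked saturating narrow above.
    func maskCvtsepi32Epi16(src [8]int16, k uint8, a [8]int32) (dst [8]int16) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = saturateInt32ToInt16(a[j])
            } else {
                dst[j] = src[j]
            }
        }
        return
    }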
func M256MaskCvtsepi32Epi8 ¶
M256MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_mask_cvtsepi32_epi8'. Requires AVX512F.
func M256MaskCvtsepi64Epi16 ¶
M256MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_mask_cvtsepi64_epi16'. Requires AVX512F.
func M256MaskCvtsepi64Epi32 ¶
M256MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_mask_cvtsepi64_epi32'. Requires AVX512F.
func M256MaskCvtsepi64Epi8 ¶
M256MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_mask_cvtsepi64_epi8'. Requires AVX512F.
func M256MaskCvttpdEpi32 ¶
M256MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_mask_cvttpd_epi32'. Requires AVX512F.
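"With truncation" means round toward zero, which is exactly what Go's float-to-integer conversion does for in-range values, so a sketch is direct. Note one hedge: for out-of-range inputs the instruction produces the integer indefinite value, while Go's result is implementation-specific.

    func maskCvttpdEpi32(src [4]int32, k uint8, a [4]float64) (dst [4]int32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = int32(a[j]) // truncates toward zero for in-range values
            } else {
                dst[j] = src[j]
            }
        }
        return
    }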
func M256MaskCvttpdEpu32 ¶
M256MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_mask_cvttpd_epu32'. Requires AVX512F.
func M256MaskCvttpsEpi32 ¶
M256MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_mask_cvttps_epi32'. Requires AVX512F.
func M256MaskCvttpsEpu32 ¶
M256MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_mask_cvttps_epu32'. Requires AVX512F.
func M256MaskCvtusepi32Epi16 ¶
M256MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_mask_cvtusepi32_epi16'. Requires AVX512F.
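The unsigned variants follow the same masked loop as the signed narrow sketched earlier; only the clamp changes. A hypothetical helper, not part of this package:

    // saturateUint32ToUint16 clamps x to the uint16 range.
    func saturateUint32ToUint16(x uint32) uint16 {
        if x > math.MaxUint16 {
            return math.MaxUint16
        }
        return uint16(x)
    }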
func M256MaskCvtusepi32Epi8 ¶
M256MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_mask_cvtusepi32_epi8'. Requires AVX512F.
func M256MaskCvtusepi64Epi16 ¶
M256MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_mask_cvtusepi64_epi16'. Requires AVX512F.
func M256MaskCvtusepi64Epi32 ¶
M256MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_mask_cvtusepi64_epi32'. Requires AVX512F.
func M256MaskCvtusepi64Epi8 ¶
M256MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_mask_cvtusepi64_epi8'. Requires AVX512F.
func M256MaskDivPd ¶
M256MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm256_mask_div_pd'. Requires AVX512F.
func M256MaskDivPs ¶
M256MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm256_mask_div_ps'. Requires AVX512F.
func M256MaskExpandEpi32 ¶
M256MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_mask_expand_epi32'. Requires AVX512F.
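The expand operation reads its source contiguously but writes sparsely. That asymmetry is easy to see in a scalar Go sketch (illustrative names, array/uint8 stand-ins for the vector and mask types):

    // maskExpandEpi32 pulls consecutive elements of a into the positions
    // where k has a set bit; other positions keep src.
    func maskExpandEpi32(src [8]int32, k uint8, a [8]int32) (dst [8]int32) {
        m := 0 // index of the next contiguous source element
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[m]
                m++
            } else {
                dst[j] = src[j]
            }
        }
        return
    }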
func M256MaskExpandEpi64 ¶
M256MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_mask_expand_epi64'. Requires AVX512F.
func M256MaskExpandPd ¶
M256MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_mask_expand_pd'. Requires AVX512F.
func M256MaskExpandPs ¶
M256MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_mask_expand_ps'. Requires AVX512F.
func M256MaskExtractf32x4Ps ¶
M256MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_mask_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskExtracti32x4Epi32 ¶
M256MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_mask_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskFixupimmPd ¶
func M256MaskFixupimmPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)
M256MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
    QNAN_TOKEN: j := 0
    SNAN_TOKEN: j := 1
    ZERO_VALUE_TOKEN: j := 2
    ONE_VALUE_TOKEN: j := 3
    NEG_INF_TOKEN: j := 4
    POS_INF_TOKEN: j := 5
    NEG_VALUE_TOKEN: j := 6
    POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
    0 : dest[63:0] := src1[63:0]
    1 : dest[63:0] := tsrc[63:0]
    2 : dest[63:0] := QNaN(tsrc[63:0])
    3 : dest[63:0] := QNAN_Indefinite
    4 : dest[63:0] := -INF
    5 : dest[63:0] := +INF
    6 : dest[63:0] := tsrc.sign ? -INF : +INF
    7 : dest[63:0] := -0
    8 : dest[63:0] := +0
    9 : dest[63:0] := -1
    10: dest[63:0] := +1
    11: dest[63:0] := 1/2
    12: dest[63:0] := 90.0
    13: dest[63:0] := PI/2
    14: dest[63:0] := MAX_FLOAT
    15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
    ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
    ZERO_VALUE_TOKEN: if imm8[1] then set #IE
    ONE_VALUE_TOKEN: if imm8[2] then set #ZE
    ONE_VALUE_TOKEN: if imm8[3] then set #IE
    SNAN_TOKEN: if imm8[4] then set #IE
    NEG_INF_TOKEN: if imm8[5] then set #IE
    NEG_VALUE_TOKEN: if imm8[6] then set #IE
    POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_mask_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskFixupimmPs ¶
func M256MaskFixupimmPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)
M256MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
    QNAN_TOKEN: j := 0
    SNAN_TOKEN: j := 1
    ZERO_VALUE_TOKEN: j := 2
    ONE_VALUE_TOKEN: j := 3
    NEG_INF_TOKEN: j := 4
    POS_INF_TOKEN: j := 5
    NEG_VALUE_TOKEN: j := 6
    POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
    0 : dest[31:0] := src1[31:0]
    1 : dest[31:0] := tsrc[31:0]
    2 : dest[31:0] := QNaN(tsrc[31:0])
    3 : dest[31:0] := QNAN_Indefinite
    4 : dest[31:0] := -INF
    5 : dest[31:0] := +INF
    6 : dest[31:0] := tsrc.sign ? -INF : +INF
    7 : dest[31:0] := -0
    8 : dest[31:0] := +0
    9 : dest[31:0] := -1
    10: dest[31:0] := +1
    11: dest[31:0] := 1/2
    12: dest[31:0] := 90.0
    13: dest[31:0] := PI/2
    14: dest[31:0] := MAX_FLOAT
    15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
    ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
    ZERO_VALUE_TOKEN: if imm8[1] then set #IE
    ONE_VALUE_TOKEN: if imm8[2] then set #ZE
    ONE_VALUE_TOKEN: if imm8[3] then set #IE
    SNAN_TOKEN: if imm8[4] then set #IE
    NEG_INF_TOKEN: if imm8[5] then set #IE
    NEG_VALUE_TOKEN: if imm8[6] then set #IE
    POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_mask_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskFmaddPd ¶
M256MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask_fmadd_pd'. Requires AVX512F.
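Two details are worth making concrete: the fused multiply-add rounds once, and masked-off lanes keep 'a' rather than a separate 'src'. Both carry over to a scalar Go model via math.FMA from the standard math package (Go 1.14+); names are illustrative:

    func maskFmaddPd(a [4]float64, k uint8, b, c [4]float64) (dst [4]float64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = math.FMA(a[j], b[j], c[j]) // one rounding, like VFMADD
            } else {
                dst[j] = a[j] // masked-off lanes copy from a, not src
            }
        }
        return
    }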
func M256MaskFmaddPs ¶
M256MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask_fmadd_ps'. Requires AVX512F.
func M256MaskFmaddsubPd ¶
M256MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask_fmaddsub_pd'. Requires AVX512F.
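"Alternatively add and subtract" is decided by lane parity: even lanes subtract, odd lanes add. The same rule applies to the Ps form below. A sketch using a plain multiply-add (so it ignores the fused single rounding; names are illustrative):

    func maskFmaddsubPd(a [4]float64, k uint8, b, c [4]float64) (dst [4]float64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) == 0 {
                dst[j] = a[j]
                continue
            }
            if j%2 == 0 {
                dst[j] = a[j]*b[j] - c[j] // even lane: subtract
            } else {
                dst[j] = a[j]*b[j] + c[j] // odd lane: add
            }
        }
        return
    }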
func M256MaskFmaddsubPs ¶
M256MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask_fmaddsub_ps'. Requires AVX512F.
func M256MaskFmsubPd ¶
M256MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask_fmsub_pd'. Requires AVX512F.
func M256MaskFmsubPs ¶
M256MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask_fmsub_ps'. Requires AVX512F.
func M256MaskFmsubaddPd ¶
M256MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask_fmsubadd_pd'. Requires AVX512F.
func M256MaskFmsubaddPs ¶
M256MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask_fmsubadd_ps'. Requires AVX512F.
func M256MaskFnmaddPd ¶
M256MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask_fnmadd_pd'. Requires AVX512F.
func M256MaskFnmaddPs ¶
M256MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask_fnmadd_ps'. Requires AVX512F.
func M256MaskFnmsubPd ¶
M256MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask_fnmsub_pd'. Requires AVX512F.
func M256MaskFnmsubPs ¶
M256MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask_fnmsub_ps'. Requires AVX512F.
func M256MaskGetexpPd ¶
M256MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_mask_getexp_pd'. Requires AVX512F.
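For finite nonzero inputs the extracted exponent is what math.Logb returns, so a scalar model is short. Special cases (zero, NaN, infinities) follow math.Logb's conventions here, which may differ in detail from VGETEXPPD:

    func maskGetexpPd(src [4]float64, k uint8, a [4]float64) (dst [4]float64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = math.Logb(a[j]) // binary exponent, i.e. floor(log2(|x|))
            } else {
                dst[j] = src[j]
            }
        }
        return
    }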
func M256MaskGetexpPs ¶
M256MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_mask_getexp_ps'. Requires AVX512F.
func M256MaskGetmantPd ¶
func M256MaskGetmantPd(src x86.M256d, k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)
M256MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_mask_getmant_pd'. Requires AVX512F.
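For the default _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src combination, the per-element normalization can be sketched with math.Frexp; zero, NaN and infinity handling is omitted and the helper name is illustrative:

    // normalizeMant scales x so its magnitude lies in [1, 2),
    // keeping the source sign.
    func normalizeMant(x float64) float64 {
        frac, _ := math.Frexp(x) // |frac| is in [0.5, 1)
        return frac * 2          // |result| is in [1, 2)
    }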
func M256MaskGetmantPs ¶
func M256MaskGetmantPs(src x86.M256, k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)
M256MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_mask_getmant_ps'. Requires AVX512F.
func M256MaskInsertf32x4 ¶
func M256MaskInsertf32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)
M256MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_mask_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskInserti32x4 ¶
func M256MaskInserti32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)
M256MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_mask_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskMaxEpi32 ¶
M256MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSD'. Intrinsic: '_mm256_mask_max_epi32'. Requires AVX512F.
func M256MaskMaxEpi64 ¶
M256MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_mask_max_epi64'. Requires AVX512F.
func M256MaskMaxEpu32 ¶
M256MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUD'. Intrinsic: '_mm256_mask_max_epu32'. Requires AVX512F.
func M256MaskMaxEpu64 ¶
M256MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_mask_max_epu64'. Requires AVX512F.
func M256MaskMaxPd ¶
M256MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm256_mask_max_pd'. Requires AVX512F.
func M256MaskMaxPs ¶
M256MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm256_mask_max_ps'. Requires AVX512F.
func M256MaskMinEpi32 ¶
M256MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSD'. Intrinsic: '_mm256_mask_min_epi32'. Requires AVX512F.
func M256MaskMinEpi64 ¶
M256MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm256_mask_min_epi64'. Requires AVX512F.
func M256MaskMinEpu32 ¶
M256MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUD'. Intrinsic: '_mm256_mask_min_epu32'. Requires AVX512F.
func M256MaskMinEpu64 ¶
M256MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm256_mask_min_epu64'. Requires AVX512F.
func M256MaskMinPd ¶
M256MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm256_mask_min_pd'. Requires AVX512F.
func M256MaskMinPs ¶
M256MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm256_mask_min_ps'. Requires AVX512F.
func M256MaskMovEpi32 ¶
M256MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_mask_mov_epi32'. Requires AVX512F.
func M256MaskMovEpi64 ¶
M256MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_mask_mov_epi64'. Requires AVX512F.
func M256MaskMovPd ¶
M256MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVAPD'. Intrinsic: '_mm256_mask_mov_pd'. Requires AVX512F.
func M256MaskMovPs ¶
M256MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVAPS'. Intrinsic: '_mm256_mask_mov_ps'. Requires AVX512F.
func M256MaskMovedupPd ¶
M256MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_mask_movedup_pd'. Requires AVX512F.
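The duplication writes element 0 into elements 0 and 1, and element 2 into elements 2 and 3, before the usual mask merge. A scalar sketch (illustrative names and stand-in types):

    func maskMovedupPd(src [4]float64, k uint8, a [4]float64) (dst [4]float64) {
        tmp := [4]float64{a[0], a[0], a[2], a[2]} // duplicate even-indexed elements
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = tmp[j]
            } else {
                dst[j] = src[j]
            }
        }
        return
    }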
func M256MaskMovehdupPs ¶
M256MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_mask_movehdup_ps'. Requires AVX512F.
func M256MaskMoveldupPs ¶
M256MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_mask_moveldup_ps'. Requires AVX512F.
func M256MaskMulEpi32 ¶
M256MaskMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm256_mask_mul_epi32'. Requires AVX512F.
func M256MaskMulEpu32 ¶
M256MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_mask_mul_epu32'. Requires AVX512F.
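Both of these multiplies read only the low 32-bit half of each 64-bit lane (element 2*j in 32-bit terms) and widen before multiplying, which Go expresses as a conversion before the product. Signed sketch shown; the epu32 form is identical with uint32/uint64 (names are illustrative):

    func maskMulEpi32(src [4]int64, k uint8, a, b [8]int32) (dst [4]int64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                // widen first so the 32x32 product keeps all 64 bits
                dst[j] = int64(a[2*j]) * int64(b[2*j])
            } else {
                dst[j] = src[j]
            }
        }
        return
    }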
func M256MaskMulPd ¶
M256MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm256_mask_mul_pd'. Requires AVX512F.
func M256MaskMulPs ¶
M256MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm256_mask_mul_ps'. Requires AVX512F.
func M256MaskMulloEpi32 ¶
M256MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULLD'. Intrinsic: '_mm256_mask_mullo_epi32'. Requires AVX512F.
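Keeping only the low 32 bits of the intermediate product is exactly Go's wrapping int32 multiplication, so the scalar model needs no explicit 64-bit temporary (illustrative names):

    func maskMulloEpi32(src [8]int32, k uint8, a, b [8]int32) (dst [8]int32) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[j] * b[j] // int32 multiply wraps: the low 32 bits survive
            } else {
                dst[j] = src[j]
            }
        }
        return
    }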
func M256MaskOrEpi32 ¶
M256MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPORD'. Intrinsic: '_mm256_mask_or_epi32'. Requires AVX512F.
func M256MaskOrEpi64 ¶
M256MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPORQ'. Intrinsic: '_mm256_mask_or_epi64'. Requires AVX512F.
func M256MaskPermutePd ¶
M256MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskPermutePs ¶
M256MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskPermutevarPd ¶
M256MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permutevar_pd'. Requires AVX512F.
func M256MaskPermutevarPs ¶
M256MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permutevar_ps'. Requires AVX512F.
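SELECT4 reads a 2-bit selector per element and indexes only within that element's own 128-bit lane. In scalar Go terms (illustrative names and stand-in types):

    func maskPermutevarPs(src [8]float32, k uint8, a [8]float32, b [8]uint32) (dst [8]float32) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) == 0 {
                dst[j] = src[j]
                continue
            }
            lane := (j / 4) * 4  // base of this element's 128-bit lane
            sel := int(b[j] & 3) // low two bits of the control element
            dst[j] = a[lane+sel]
        }
        return
    }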
func M256MaskPermutex2varEpi32 ¶
func M256MaskPermutex2varEpi32(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMT2D'. Intrinsic: '_mm256_mask_permutex2var_epi32'. Requires AVX512F.
func M256MaskPermutex2varEpi64 ¶
func M256MaskPermutex2varEpi64(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMT2Q'. Intrinsic: '_mm256_mask_permutex2var_epi64'. Requires AVX512F.
func M256MaskPermutex2varPd ¶
M256MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMT2PD'. Intrinsic: '_mm256_mask_permutex2var_pd'. Requires AVX512F.
func M256MaskPermutex2varPs ¶
M256MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMT2PS'. Intrinsic: '_mm256_mask_permutex2var_ps'. Requires AVX512F.
func M256MaskPermutexEpi64 ¶
M256MaskPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[63:0] := src[63:0]
    1: tmp[63:0] := src[127:64]
    2: tmp[63:0] := src[191:128]
    3: tmp[63:0] := src[255:192]
    ESAC
    RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskPermutexPd ¶
M256MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[63:0] := src[63:0]
    1: tmp[63:0] := src[127:64]
    2: tmp[63:0] := src[191:128]
    3: tmp[63:0] := src[255:192]
    ESAC
    RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskPermutexvarEpi32 ¶
func M256MaskPermutexvarEpi32(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
M256MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm256_mask_permutexvar_epi32'. Requires AVX512F.
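Unlike the in-lane permutes above, this one indexes across the whole vector with a 3-bit index per element. A scalar sketch (illustrative names and stand-in types):

    func maskPermutexvarEpi32(src [8]int32, k uint8, idx [8]uint32, a [8]int32) (dst [8]int32) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[idx[j]&7] // 3 bits select any of the 8 source elements
            } else {
                dst[j] = src[j]
            }
        }
        return
    }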
func M256MaskPermutexvarEpi64 ¶
func M256MaskPermutexvarEpi64(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)
M256MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutexvar_epi64'. Requires AVX512F.
func M256MaskPermutexvarPd ¶
M256MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutexvar_pd'. Requires AVX512F.
func M256MaskPermutexvarPs ¶
M256MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm256_mask_permutexvar_ps'. Requires AVX512F.
func M256MaskRcp14Pd ¶
M256MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm256_mask_rcp14_pd'. Requires AVX512F.
func M256MaskRcp14Ps ¶
M256MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm256_mask_rcp14_ps'. Requires AVX512F.
func M256MaskRolEpi32 ¶
M256MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm256_mask_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
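In plain Go the rotate helper is math/bits: RotateLeft32 already reduces its count modulo 32, and a negative count rotates right, so the same sketch covers the ror/rolv/rorv variants below by varying the count's source and sign (names are illustrative):

    func maskRolEpi32(src [8]uint32, k uint8, a [8]uint32, imm8 uint8) (dst [8]uint32) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = bits.RotateLeft32(a[j], int(imm8)) // count taken modulo 32
            } else {
                dst[j] = src[j]
            }
        }
        return
    }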
func M256MaskRolEpi64 ¶
M256MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm256_mask_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskRolvEpi32 ¶
M256MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm256_mask_rolv_epi32'. Requires AVX512F.
func M256MaskRolvEpi64 ¶
M256MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm256_mask_rolv_epi64'. Requires AVX512F.
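The rotate pseudocode above maps directly onto Go's math/bits package. A minimal scalar sketch of the variable-rotate form, modelling the vector as [8]uint32 and the writemask as a uint8 (an illustrative helper, not part of this package):

    // maskRolvEpi32 models the _mm256_mask_rolv_epi32 pseudocode on plain
    // Go values. Requires: import "math/bits".
    func maskRolvEpi32(src, a, b [8]uint32, k uint8) (dst [8]uint32) {
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = bits.RotateLeft32(a[j], int(b[j]&31)) // count is taken modulo 32
            } else {
                dst[j] = src[j] // mask bit clear: copy through from src
            }
        }
        return
    }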
func M256MaskRorEpi32 ¶
M256MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm256_mask_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskRorEpi64 ¶
M256MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm256_mask_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskRorvEpi32 ¶
M256MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm256_mask_rorv_epi32'. Requires AVX512F.
func M256MaskRorvEpi64 ¶
M256MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm256_mask_rorv_epi64'. Requires AVX512F.
func M256MaskRoundscalePd ¶
M256MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF(imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_mask_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskRoundscalePs ¶
M256MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF(imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_mask_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
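Ignoring the exception-flag bookkeeping, the RoundTo_Integer helper is just "scale up by 2^M, round to an integer, scale back down". A sketch of the rounding_direction = 0 (round to nearest even) case, assuming the standard math package (illustrative only):

    // roundscale rounds x to m fraction bits using round-to-nearest-even.
    func roundscale(x float64, m uint8) float64 {
        scaled := math.Ldexp(x, int(m)) // 2^M * x
        return math.Ldexp(math.RoundToEven(scaled), -int(m)) // round, then scale back down
    }

For example, roundscale(1.2345, 4) rounds to the nearest multiple of 2^-4 and yields 1.25.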
func M256MaskRsqrt14Pd ¶
M256MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_mask_rsqrt14_pd'. Requires AVX512F.
func M256MaskRsqrt14Ps ¶
M256MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_mask_rsqrt14_ps'. Requires AVX512F.
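The 2^-14 bound describes a hardware estimate; a software model can only show the value being approximated. A trivial sketch, assuming the standard math package (illustrative only; real hardware returns a table-based approximation of this value):

    // rsqrt14 computes the exact value that VRSQRT14PS approximates to
    // within a relative error of 2^-14.
    func rsqrt14(x float64) float64 {
        return 1.0 / math.Sqrt(x)
    }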
func M256MaskScalefPd ¶
M256MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_mask_scalef_pd'. Requires AVX512F.
func M256MaskScalefPs ¶
M256MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_mask_scalef_ps'. Requires AVX512F.
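Setting aside the NaN and denormal special cases spelled out in the pseudocode, SCALE reduces to multiplying by two raised to the floor of the second operand. A hedged scalar sketch, assuming the standard math package:

    // scalef models the SCALE helper's normal path: a * 2^FLOOR(b).
    func scalef(a, b float64) float64 {
        return a * math.Pow(2, math.Floor(b))
    }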
func M256MaskSet1Epi32 ¶
M256MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_set1_epi32'. Requires AVX512F.
func M256MaskSet1Epi64 ¶
M256MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_set1_epi64'. Requires AVX512F.
func M256MaskShuffleEpi32 ¶
M256MaskShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPSHUFD'. Intrinsic: '_mm256_mask_shuffle_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
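Each two-bit field of imm8 picks one of the four dwords of the source lane. A scalar sketch of SELECT4 over a single 128-bit lane, held as [4]uint32 (an illustrative helper, not part of this package):

    // shuffleLaneEpi32 applies the SELECT4 shuffle to one 128-bit lane:
    // result position p takes the source element indexed by imm8 bits 2p+1:2p.
    func shuffleLaneEpi32(lane [4]uint32, imm8 uint8) (out [4]uint32) {
        for p := 0; p < 4; p++ {
            out[p] = lane[(imm8>>(2*p))&3]
        }
        return
    }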
func M256MaskShuffleF32x4 ¶
func M256MaskShuffleF32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
M256MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT2(src, control){
    CASE(control[0])
        0: tmp[127:0] := src[127:0]
        1: tmp[127:0] := src[255:128]
    ESAC
    RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_mask_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskShuffleF64x2 ¶
func M256MaskShuffleF64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
M256MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT2(src, control){
    CASE(control[0])
        0: tmp[127:0] := src[127:0]
        1: tmp[127:0] := src[255:128]
    ESAC
    RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_mask_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskShuffleI32x4 ¶
func M256MaskShuffleI32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT2(src, control){
    CASE(control[0])
        0: tmp[127:0] := src[127:0]
        1: tmp[127:0] := src[255:128]
    ESAC
    RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_mask_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskShuffleI64x2 ¶
func M256MaskShuffleI64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT2(src, control){
    CASE(control[0])
        0: tmp[127:0] := src[127:0]
        1: tmp[127:0] := src[255:128]
    ESAC
    RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_mask_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
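SELECT2 picks whole 128-bit halves. A scalar sketch with a __m256i modelled as [4]uint64 (hypothetical helpers, not part of this package):

    // sel2 returns the lower or upper 128-bit half of src, picked by one control bit.
    func sel2(src [4]uint64, bit uint8) [2]uint64 {
        if bit&1 == 1 {
            return [2]uint64{src[2], src[3]} // upper 128 bits
        }
        return [2]uint64{src[0], src[1]} // lower 128 bits
    }

    // shuffleI64x2 models the unmasked core of the shuffle: the low half
    // comes from a (imm8 bit 0), the high half from b (imm8 bit 1).
    func shuffleI64x2(a, b [4]uint64, imm8 uint8) (dst [4]uint64) {
        lo := sel2(a, imm8)
        hi := sel2(b, imm8>>1)
        copy(dst[:2], lo[:])
        copy(dst[2:], hi[:])
        return
    }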
func M256MaskShufflePd ¶
func M256MaskShufflePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)
M256MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm256_mask_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskShufflePs ¶
func M256MaskShufflePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)
M256MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm256_mask_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskSllEpi32 ¶
M256MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_sll_epi32'. Requires AVX512F.
func M256MaskSllEpi64 ¶
M256MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_sll_epi64'. Requires AVX512F.
func M256MaskSlliEpi32 ¶
M256MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_slli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskSlliEpi64 ¶
M256MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
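A scalar sketch of the masked immediate shift, with the count-out-of-range case made explicit (an illustrative helper, not part of this package):

    // maskSlliEpi64 models the _mm256_mask_slli_epi64 pseudocode: counts
    // above 63 produce 0; unselected elements are copied from src.
    func maskSlliEpi64(src, a [4]uint64, k uint8, imm8 uint8) (dst [4]uint64) {
        for j := 0; j < 4; j++ {
            switch {
            case k&(1<<j) == 0:
                dst[j] = src[j] // mask bit clear: copy through from src
            case imm8 > 63:
                dst[j] = 0 // the entire element is shifted out
            default:
                dst[j] = a[j] << imm8
            }
        }
        return
    }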
func M256MaskSllvEpi32 ¶
M256MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVD'. Intrinsic: '_mm256_mask_sllv_epi32'. Requires AVX512F.
func M256MaskSllvEpi64 ¶
M256MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_mask_sllv_epi64'. Requires AVX512F.
func M256MaskSqrtPd ¶
M256MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm256_mask_sqrt_pd'. Requires AVX512F.
func M256MaskSqrtPs ¶
M256MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm256_mask_sqrt_ps'. Requires AVX512F.
func M256MaskSraEpi32 ¶
M256MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_sra_epi32'. Requires AVX512F.
func M256MaskSraEpi64 ¶
M256MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_sra_epi64'. Requires AVX512F.
func M256MaskSraiEpi32 ¶
M256MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_srai_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskSraiEpi64 ¶
M256MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
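The arithmetic shift differs from the logical one only in its out-of-range case: a count above 63 leaves every bit equal to the sign bit, which is the same as shifting by 63. A scalar sketch (an illustrative helper, not part of this package):

    // maskSraiEpi64 models the _mm256_mask_srai_epi64 pseudocode using
    // Go's arithmetic right shift on signed integers.
    func maskSraiEpi64(src, a [4]int64, k uint8, imm8 uint8) (dst [4]int64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) == 0 {
                dst[j] = src[j] // mask bit clear: copy through from src
                continue
            }
            s := uint(imm8)
            if s > 63 {
                s = 63 // replicates the sign bit across the element
            }
            dst[j] = a[j] >> s // arithmetic shift: sign bits shift in
        }
        return
    }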
func M256MaskSravEpi32 ¶
M256MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVD'. Intrinsic: '_mm256_mask_srav_epi32'. Requires AVX512F.
func M256MaskSravEpi64 ¶
M256MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_mask_srav_epi64'. Requires AVX512F.
func M256MaskSrlEpi32 ¶
M256MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srl_epi32'. Requires AVX512F.
func M256MaskSrlEpi64 ¶
M256MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srl_epi64'. Requires AVX512F.
func M256MaskSrliEpi32 ¶
M256MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskSrliEpi64 ¶
M256MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskSrlvEpi32 ¶
M256MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVD'. Intrinsic: '_mm256_mask_srlv_epi32'. Requires AVX512F.
func M256MaskSrlvEpi64 ¶
M256MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_mask_srlv_epi64'. Requires AVX512F.
func M256MaskSubEpi32 ¶
M256MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBD'. Intrinsic: '_mm256_mask_sub_epi32'. Requires AVX512F.
func M256MaskSubEpi64 ¶
M256MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm256_mask_sub_epi64'. Requires AVX512F.
func M256MaskSubPd ¶
M256MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm256_mask_sub_pd'. Requires AVX512F.
func M256MaskSubPs ¶
M256MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm256_mask_sub_ps'. Requires AVX512F.
func M256MaskTernarylogicEpi32 ¶
func M256MaskTernarylogicEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
    i := j*32
    IF k[j]
        FOR h := 0 to 31
            index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
            dst[i+h] := imm8[index[2:0]]
        ENDFOR
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_mask_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskTernarylogicEpi64 ¶
func M256MaskTernarylogicEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*64
    IF k[j]
        FOR h := 0 to 63
            index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
            dst[i+h] := imm8[index[2:0]]
        ENDFOR
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_mask_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
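The truth-table evaluation is easy to model bit by bit: the three input bits form an index, and the result bit is the corresponding bit of imm8. A scalar sketch over one 32-bit element (an illustrative helper, not part of this package):

    // ternarylogic32 models VPTERNLOGD on a single element: imm8 is an
    // 8-entry truth table indexed by the (src, a, b) bit triple.
    func ternarylogic32(src, a, b uint32, imm8 uint8) (dst uint32) {
        for h := 0; h < 32; h++ {
            idx := (src>>h&1)<<2 | (a>>h&1)<<1 | (b >> h & 1)
            dst |= (uint32(imm8>>idx) & 1) << h
        }
        return
    }

For instance, imm8 = 0xE8 encodes the three-way majority function of src, a, and b.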
func M256MaskTestEpi32Mask ¶
M256MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTMD'. Intrinsic: '_mm256_mask_test_epi32_mask'. Requires AVX512F.
func M256MaskTestEpi64Mask ¶
M256MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_mask_test_epi64_mask'. Requires AVX512F.
func M256MaskTestnEpi32Mask ¶
M256MaskTestnEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 7 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_mask_testn_epi32_mask'. Requires AVX512F.
func M256MaskTestnEpi64Mask ¶
M256MaskTestnEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 3 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_mask_testn_epi64_mask'. Requires AVX512F.
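A scalar sketch of the masked test, with the masks modelled as uint8 (an illustrative helper, not part of this package):

    // maskTestEpi32Mask models _mm256_mask_test_epi32_mask: result bit j
    // is set only when k1[j] is set and a[j] AND b[j] is non-zero.
    func maskTestEpi32Mask(k1 uint8, a, b [8]uint32) (k uint8) {
        for j := 0; j < 8; j++ {
            if k1&(1<<j) != 0 && a[j]&b[j] != 0 {
                k |= 1 << j
            }
        }
        return
    }

The testn variants differ only in testing the AND for zero rather than non-zero.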
func M256MaskUnpackhiEpi32 ¶
M256MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
    dst[31:0] := src1[95:64]
    dst[63:32] := src2[95:64]
    dst[95:64] := src1[127:96]
    dst[127:96] := src2[127:96]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_mask_unpackhi_epi32'. Requires AVX512F.
func M256MaskUnpackhiEpi64 ¶
M256MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
    dst[63:0] := src1[127:64]
    dst[127:64] := src2[127:64]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_mask_unpackhi_epi64'. Requires AVX512F.
func M256MaskUnpackhiPd ¶
M256MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
    dst[63:0] := src1[127:64]
    dst[127:64] := src2[127:64]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_mask_unpackhi_pd'. Requires AVX512F.
func M256MaskUnpackhiPs ¶
M256MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
    dst[31:0] := src1[95:64]
    dst[63:32] := src2[95:64]
    dst[95:64] := src1[127:96]
    dst[127:96] := src2[127:96]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_mask_unpackhi_ps'. Requires AVX512F.
func M256MaskUnpackloEpi32 ¶
M256MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
    dst[31:0] := src1[31:0]
    dst[63:32] := src2[31:0]
    dst[95:64] := src1[63:32]
    dst[127:96] := src2[63:32]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_mask_unpacklo_epi32'. Requires AVX512F.
func M256MaskUnpackloEpi64 ¶
M256MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
    dst[63:0] := src1[63:0]
    dst[127:64] := src2[63:0]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_mask_unpacklo_epi64'. Requires AVX512F.
func M256MaskUnpackloPd ¶
M256MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
    dst[63:0] := src1[63:0]
    dst[127:64] := src2[63:0]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_mask_unpacklo_pd'. Requires AVX512F.
func M256MaskUnpackloPs ¶
M256MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
    dst[31:0] := src1[31:0]
    dst[63:32] := src2[31:0]
    dst[95:64] := src1[63:32]
    dst[127:96] := src2[63:32]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_mask_unpacklo_ps'. Requires AVX512F.
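A scalar sketch of the dword interleave helpers over one 128-bit lane; the 256-bit forms apply them to each lane independently (illustrative helpers, not part of this package):

    // interleaveLoDwords models INTERLEAVE_DWORDS on a lane held as [4]uint32.
    func interleaveLoDwords(src1, src2 [4]uint32) [4]uint32 {
        return [4]uint32{src1[0], src2[0], src1[1], src2[1]}
    }

    // interleaveHiDwords models INTERLEAVE_HIGH_DWORDS on the same layout.
    func interleaveHiDwords(src1, src2 [4]uint32) [4]uint32 {
        return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
    }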
func M256MaskXorEpi32 ¶
M256MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPXORD'. Intrinsic: '_mm256_mask_xor_epi32'. Requires AVX512F.
func M256MaskXorEpi64 ¶
M256MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm256_mask_xor_epi64'. Requires AVX512F.
func M256MaskzAbsEpi32 ¶
M256MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm256_maskz_abs_epi32'. Requires AVX512F.
func M256MaskzAbsEpi64 ¶
M256MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm256_maskz_abs_epi64'. Requires AVX512F.
func M256MaskzAddEpi32 ¶
M256MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDD'. Intrinsic: '_mm256_maskz_add_epi32'. Requires AVX512F.
func M256MaskzAddEpi64 ¶
M256MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm256_maskz_add_epi64'. Requires AVX512F.
func M256MaskzAndEpi32 ¶
M256MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDD'. Intrinsic: '_mm256_maskz_and_epi32'. Requires AVX512F.
func M256MaskzAndEpi64 ¶
M256MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDQ'. Intrinsic: '_mm256_maskz_and_epi64'. Requires AVX512F.
func M256MaskzAndnotEpi32 ¶
M256MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDND'. Intrinsic: '_mm256_maskz_andnot_epi32'. Requires AVX512F.
func M256MaskzAndnotEpi64 ¶
M256MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPANDNQ'. Intrinsic: '_mm256_maskz_andnot_epi64'. Requires AVX512F.
func M256MaskzBroadcastF32x4 ¶
M256MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_maskz_broadcast_f32x4'. Requires AVX512F.
func M256MaskzBroadcastI32x4 ¶
M256MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_maskz_broadcast_i32x4'. Requires AVX512F.
func M256MaskzBroadcastdEpi32 ¶
M256MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_broadcastd_epi32'. Requires AVX512F.
func M256MaskzBroadcastqEpi64 ¶
M256MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_broadcastq_epi64'. Requires AVX512F.
func M256MaskzBroadcastsdPd ¶
M256MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_maskz_broadcastsd_pd'. Requires AVX512F.
func M256MaskzBroadcastssPs ¶
M256MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_maskz_broadcastss_ps'. Requires AVX512F.
func M256MaskzCompressEpi32 ¶
M256MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_maskz_compress_epi32'. Requires AVX512F.
func M256MaskzCompressEpi64 ¶
M256MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_maskz_compress_epi64'. Requires AVX512F.
func M256MaskzCompressPd ¶
M256MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_maskz_compress_pd'. Requires AVX512F.
func M256MaskzCompressPs ¶
M256MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[255:m] := 0 dst[MAX:256] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_maskz_compress_ps'. Requires AVX512F.
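A scalar sketch of the zeroing compress (an illustrative helper, not part of this package):

    // maskzCompressEpi32 models _mm256_maskz_compress_epi32: selected
    // elements are packed toward index 0 and the tail stays zero.
    func maskzCompressEpi32(k uint8, a [8]uint32) (dst [8]uint32) {
        m := 0
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[m] = a[j]
                m++
            }
        }
        return // dst[m:] remains zero, matching dst[255:m] := 0
    }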
func M256MaskzCvtRoundpsPh ¶
M256MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
    i := 16*j
    l := 32*j
    IF k[j]
        dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvt_roundps_ph'. Requires AVX512F.
func M256MaskzCvtepi16Epi32 ¶
M256MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_maskz_cvtepi16_epi32'. Requires AVX512F.
func M256MaskzCvtepi16Epi64 ¶
M256MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_maskz_cvtepi16_epi64'. Requires AVX512F.
func M256MaskzCvtepi32Epi16 ¶
M256MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm256_maskz_cvtepi32_epi16'. Requires AVX512F.
func M256MaskzCvtepi32Epi64 ¶
M256MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_maskz_cvtepi32_epi64'. Requires AVX512F.
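A scalar sketch of the zero-masked widening conversion; Go's int64(int32) conversion is exactly the SignExtend step (an illustrative helper, not part of this package):

    // maskzCvtepi32Epi64 models _mm256_maskz_cvtepi32_epi64 on plain Go values.
    func maskzCvtepi32Epi64(k uint8, a [4]int32) (dst [4]int64) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = int64(a[j]) // sign extension
            } // else dst[j] stays 0 (zeromask behaviour)
        }
        return
    }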
func M256MaskzCvtepi32Epi8 ¶
M256MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm256_maskz_cvtepi32_epi8'. Requires AVX512F.
func M256MaskzCvtepi32Pd ¶
M256MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_maskz_cvtepi32_pd'. Requires AVX512F.
func M256MaskzCvtepi32Ps ¶
M256MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_maskz_cvtepi32_ps'. Requires AVX512F.
func M256MaskzCvtepi64Epi16 ¶
M256MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm256_maskz_cvtepi64_epi16'. Requires AVX512F.
func M256MaskzCvtepi64Epi32 ¶
M256MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm256_maskz_cvtepi64_epi32'. Requires AVX512F.
func M256MaskzCvtepi64Epi8 ¶
M256MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm256_maskz_cvtepi64_epi8'. Requires AVX512F.
func M256MaskzCvtepi8Epi32 ¶
M256MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_maskz_cvtepi8_epi32'. Requires AVX512F.
func M256MaskzCvtepi8Epi64 ¶
M256MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_maskz_cvtepi8_epi64'. Requires AVX512F.
func M256MaskzCvtepu16Epi32 ¶
M256MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_maskz_cvtepu16_epi32'. Requires AVX512F.
func M256MaskzCvtepu16Epi64 ¶
M256MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_maskz_cvtepu16_epi64'. Requires AVX512F.
func M256MaskzCvtepu32Epi64 ¶
M256MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_maskz_cvtepu32_epi64'. Requires AVX512F.
func M256MaskzCvtepu32Pd ¶
M256MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_maskz_cvtepu32_pd'. Requires AVX512F.
func M256MaskzCvtepu8Epi32 ¶
M256MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_maskz_cvtepu8_epi32'. Requires AVX512F.
func M256MaskzCvtepu8Epi64 ¶
M256MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_maskz_cvtepu8_epi64'. Requires AVX512F.
func M256MaskzCvtpdEpi32 ¶
M256MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_maskz_cvtpd_epi32'. Requires AVX512F.
func M256MaskzCvtpdEpu32 ¶
M256MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_maskz_cvtpd_epu32'. Requires AVX512F.
func M256MaskzCvtpdPs ¶
M256MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_maskz_cvtpd_ps'. Requires AVX512F.
func M256MaskzCvtphPs ¶
M256MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_maskz_cvtph_ps'. Requires AVX512F.
func M256MaskzCvtpsEpi32 ¶
M256MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_maskz_cvtps_epi32'. Requires AVX512F.
func M256MaskzCvtpsEpu32 ¶
M256MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_maskz_cvtps_epu32'. Requires AVX512F.
func M256MaskzCvtpsPh ¶
M256MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvtps_ph'. Requires AVX512F.
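The four explicit rounding directions can be modeled on scalar float64 values. A sketch using Go's math package; the constant values below match the numeric encoding of the C-side _MM_FROUND_* macros, but the names are illustrative and not this package's API:

package main

import (
	"fmt"
	"math"
)

const (
	froundToNearestInt = 0 // round to nearest even
	froundToNegInf     = 1 // round down
	froundToPosInf     = 2 // round up
	froundToZero       = 3 // truncate
)

// round applies one of the four explicit rounding directions to x.
func round(x float64, mode int) float64 {
	switch mode {
	case froundToNearestInt:
		return math.RoundToEven(x)
	case froundToNegInf:
		return math.Floor(x)
	case froundToPosInf:
		return math.Ceil(x)
	default: // froundToZero
		return math.Trunc(x)
	}
}

func main() {
	for mode := 0; mode < 4; mode++ {
		fmt.Println(round(2.5, mode), round(-2.5, mode))
	}
}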
func M256MaskzCvtsepi32Epi16 ¶
M256MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_maskz_cvtsepi32_epi16'. Requires AVX512F.
func M256MaskzCvtsepi32Epi8 ¶
M256MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_maskz_cvtsepi32_epi8'. Requires AVX512F.
func M256MaskzCvtsepi64Epi16 ¶
M256MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_maskz_cvtsepi64_epi16'. Requires AVX512F.
func M256MaskzCvtsepi64Epi32 ¶
M256MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_maskz_cvtsepi64_epi32'. Requires AVX512F.
func M256MaskzCvtsepi64Epi8 ¶
M256MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_maskz_cvtsepi64_epi8'. Requires AVX512F.
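The Cvtsepi* family narrows with signed saturation: a value outside the destination range clamps to the nearest representable extreme instead of wrapping. A scalar sketch of Saturate_Int32_To_Int8 (hypothetical helper name):

package main

import (
	"fmt"
	"math"
)

// saturateInt32ToInt8 clamps out-of-range values to the int8 extremes
// rather than letting them wrap as a plain int8(x) conversion would.
func saturateInt32ToInt8(x int32) int8 {
	if x > math.MaxInt8 {
		return math.MaxInt8
	}
	if x < math.MinInt8 {
		return math.MinInt8
	}
	return int8(x)
}

func main() {
	fmt.Println(saturateInt32ToInt8(300), saturateInt32ToInt8(-300), saturateInt32ToInt8(7))
	// 127 -128 7
}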
func M256MaskzCvttpdEpi32 ¶
M256MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_maskz_cvttpd_epi32'. Requires AVX512F.
func M256MaskzCvttpdEpu32 ¶
M256MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_maskz_cvttpd_epu32'. Requires AVX512F.
func M256MaskzCvttpsEpi32 ¶
M256MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_maskz_cvttps_epi32'. Requires AVX512F.
func M256MaskzCvttpsEpu32 ¶
M256MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_maskz_cvttps_epu32'. Requires AVX512F.
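Truncating conversions discard the fractional part (round toward zero) regardless of MXCSR.RC. In Go this is the built-in behavior of a float-to-int conversion, so a scalar model of Convert_FP32_To_Int32_Truncate is a one-liner (helper name illustrative):

package main

import "fmt"

// cvttF32ToI32 models Convert_FP32_To_Int32_Truncate. Go's float-to-int
// conversion already rounds toward zero, matching VCVTTPS2DQ.
func cvttF32ToI32(x float32) int32 {
	return int32(x)
}

func main() {
	fmt.Println(cvttF32ToI32(2.9), cvttF32ToI32(-2.9)) // 2 -2
}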
func M256MaskzCvtusepi32Epi16 ¶
M256MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_maskz_cvtusepi32_epi16'. Requires AVX512F.
func M256MaskzCvtusepi32Epi8 ¶
M256MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_maskz_cvtusepi32_epi8'. Requires AVX512F.
func M256MaskzCvtusepi64Epi16 ¶
M256MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_maskz_cvtusepi64_epi16'. Requires AVX512F.
func M256MaskzCvtusepi64Epi32 ¶
M256MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_maskz_cvtusepi64_epi32'. Requires AVX512F.
func M256MaskzCvtusepi64Epi8 ¶
M256MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_maskz_cvtusepi64_epi8'. Requires AVX512F.
func M256MaskzDivPd ¶
M256MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm256_maskz_div_pd'. Requires AVX512F.
func M256MaskzDivPs ¶
M256MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm256_maskz_div_ps'. Requires AVX512F.
func M256MaskzExpandEpi32 ¶
M256MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_maskz_expand_epi32'. Requires AVX512F.
func M256MaskzExpandEpi64 ¶
M256MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_maskz_expand_epi64'. Requires AVX512F.
func M256MaskzExpandPd ¶
M256MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_maskz_expand_pd'. Requires AVX512F.
func M256MaskzExpandPs ¶
M256MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_maskz_expand_ps'. Requires AVX512F.
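Expand is the inverse of a compress: source elements are consumed contiguously from 'a' and scattered to the destination lanes whose mask bits are set. A scalar Go sketch of the '_mm256_maskz_expand_epi32' semantics (hypothetical helper, not this package's API):

package main

import "fmt"

// maskzExpandEpi32 models VPEXPANDD with zeromasking: each contiguous
// source element lands in the next destination lane whose mask bit is
// set; all other lanes are zeroed.
func maskzExpandEpi32(k uint8, a [8]int32) (dst [8]int32) {
	m := 0 // index of the next contiguous source element
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[m]
			m++
		}
	}
	return
}

func main() {
	fmt.Println(maskzExpandEpi32(0xA2, [8]int32{10, 20, 30, 40, 50, 60, 70, 80}))
	// mask 0b10100010 -> lanes 1, 5, 7 receive 10, 20, 30
}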
func M256MaskzExtractf32x4Ps ¶
M256MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_maskz_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzExtracti32x4Epi32 ¶
M256MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_maskz_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
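Both extract intrinsics select the low (imm8 = 0) or high (imm8 = 1) 128-bit lane of the 256-bit source before masking. A scalar Go sketch with the 256-bit vector modeled as [8]float32 (helper name illustrative):

package main

import "fmt"

// maskzExtractF32x4 models VEXTRACTF32X4: imm8 picks the low or high
// 128-bit lane of a 256-bit vector, then the zeromask is applied.
func maskzExtractF32x4(k uint8, a [8]float32, imm8 byte) (dst [4]float32) {
	base := 0
	if imm8&1 == 1 {
		base = 4 // high lane: elements 4..7
	}
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[base+j]
		}
	}
	return
}

func main() {
	a := [8]float32{0, 1, 2, 3, 4, 5, 6, 7}
	fmt.Println(maskzExtractF32x4(0xF, a, 1)) // [4 5 6 7]
}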
func M256MaskzFixupimmPd ¶
func M256MaskzFixupimmPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)
M256MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
    QNAN_TOKEN: j := 0
    SNAN_TOKEN: j := 1
    ZERO_VALUE_TOKEN: j := 2
    ONE_VALUE_TOKEN: j := 3
    NEG_INF_TOKEN: j := 4
    POS_INF_TOKEN: j := 5
    NEG_VALUE_TOKEN: j := 6
    POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
    0 : dest[63:0] := src1[63:0]
    1 : dest[63:0] := tsrc[63:0]
    2 : dest[63:0] := QNaN(tsrc[63:0])
    3 : dest[63:0] := QNAN_Indefinite
    4 : dest[63:0] := -INF
    5 : dest[63:0] := +INF
    6 : dest[63:0] := tsrc.sign? -INF : +INF
    7 : dest[63:0] := -0
    8 : dest[63:0] := +0
    9 : dest[63:0] := -1
    10: dest[63:0] := +1
    11: dest[63:0] := 1/2
    12: dest[63:0] := 90.0
    13: dest[63:0] := PI/2
    14: dest[63:0] := MAX_FLOAT
    15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
    ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
    ZERO_VALUE_TOKEN: if imm8[1] then set #IE
    ONE_VALUE_TOKEN: if imm8[2] then set #ZE
    ONE_VALUE_TOKEN: if imm8[3] then set #IE
    SNAN_TOKEN: if imm8[4] then set #IE
    NEG_INF_TOKEN: if imm8[5] then set #IE
    NEG_VALUE_TOKEN: if imm8[6] then set #IE
    POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 3
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_maskz_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzFixupimmPs ¶
func M256MaskzFixupimmPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)
M256MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
    QNAN_TOKEN: j := 0
    SNAN_TOKEN: j := 1
    ZERO_VALUE_TOKEN: j := 2
    ONE_VALUE_TOKEN: j := 3
    NEG_INF_TOKEN: j := 4
    POS_INF_TOKEN: j := 5
    NEG_VALUE_TOKEN: j := 6
    POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
    0 : dest[31:0] := src1[31:0]
    1 : dest[31:0] := tsrc[31:0]
    2 : dest[31:0] := QNaN(tsrc[31:0])
    3 : dest[31:0] := QNAN_Indefinite
    4 : dest[31:0] := -INF
    5 : dest[31:0] := +INF
    6 : dest[31:0] := tsrc.sign? -INF : +INF
    7 : dest[31:0] := -0
    8 : dest[31:0] := +0
    9 : dest[31:0] := -1
    10: dest[31:0] := +1
    11: dest[31:0] := 1/2
    12: dest[31:0] := 90.0
    13: dest[31:0] := PI/2
    14: dest[31:0] := MAX_FLOAT
    15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
    ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
    ZERO_VALUE_TOKEN: if imm8[1] then set #IE
    ONE_VALUE_TOKEN: if imm8[2] then set #ZE
    ONE_VALUE_TOKEN: if imm8[3] then set #IE
    SNAN_TOKEN: if imm8[4] then set #IE
    NEG_INF_TOKEN: if imm8[5] then set #IE
    NEG_VALUE_TOKEN: if imm8[6] then set #IE
    POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 7
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_maskz_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
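The heart of the fixupimm operation is a table lookup: the classified token j (0..7) selects a 4-bit response code from the integer control element in 'c'. A minimal Go sketch of just that lookup (helper name and control-word values are illustrative only):

package main

import "fmt"

// tokenResponse extracts the 4-bit response for token j from a 32-bit
// control word: bits [4*j+3 : 4*j], exactly as in the pseudocode above.
func tokenResponse(c uint32, token uint) uint32 {
	return (c >> (4 * token)) & 0xF
}

func main() {
	// Respond with code 0x8 (+0) for ZERO_VALUE_TOKEN (2) and code 0x5
	// (+INF) for POS_INF_TOKEN (5); all other tokens get 0 ("keep src1").
	c := uint32(0x8<<(4*2) | 0x5<<(4*5))
	fmt.Printf("%#x %#x\n", tokenResponse(c, 2), tokenResponse(c, 5)) // 0x8 0x5
}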
func M256MaskzFmaddPd ¶
M256MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_maskz_fmadd_pd'. Requires AVX512F.
func M256MaskzFmaddPs ¶
M256MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_maskz_fmadd_ps'. Requires AVX512F.
func M256MaskzFmaddsubPd ¶
M256MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_maskz_fmaddsub_pd'. Requires AVX512F.
func M256MaskzFmaddsubPs ¶
M256MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_maskz_fmaddsub_ps'. Requires AVX512F.
func M256MaskzFmsubPd ¶
M256MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_maskz_fmsub_pd'. Requires AVX512F.
func M256MaskzFmsubPs ¶
M256MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_maskz_fmsub_ps'. Requires AVX512F.
func M256MaskzFmsubaddPd ¶
M256MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_maskz_fmsubadd_pd'. Requires AVX512F.
func M256MaskzFmsubaddPs ¶
M256MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_maskz_fmsubadd_ps'. Requires AVX512F.
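The only difference between fmaddsub and fmsubadd is which parity of lane adds and which subtracts. A scalar Go sketch of the fmaddsub variant (note: real FMA instructions round once per lane, while this sketch rounds the product and the sum separately, so results can differ in the last bit):

package main

import "fmt"

// maskzFmaddsubPd models VFMADDSUB*PD with zeromasking: even lanes
// compute a*b - c and odd lanes compute a*b + c (fmsubadd swaps the two).
func maskzFmaddsubPd(k uint8, a, b, c [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: lane stays 0
		}
		if j%2 == 0 {
			dst[j] = a[j]*b[j] - c[j]
		} else {
			dst[j] = a[j]*b[j] + c[j]
		}
	}
	return
}

func main() {
	one := [4]float64{1, 1, 1, 1}
	fmt.Println(maskzFmaddsubPd(0xF, [4]float64{2, 2, 2, 2}, one, one))
	// [1 3 1 3]
}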
func M256MaskzFnmaddPd ¶
M256MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_maskz_fnmadd_pd'. Requires AVX512F.
func M256MaskzFnmaddPs ¶
M256MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_maskz_fnmadd_ps'. Requires AVX512F.
func M256MaskzFnmsubPd ¶
M256MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_maskz_fnmsub_pd'. Requires AVX512F.
func M256MaskzFnmsubPs ¶
M256MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_maskz_fnmsub_ps'. Requires AVX512F.
func M256MaskzGetexpPd ¶
M256MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_maskz_getexp_pd'. Requires AVX512F.
func M256MaskzGetexpPs ¶
M256MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_maskz_getexp_ps'. Requires AVX512F.
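For finite nonzero inputs, ConvertExpFP64 is the unbiased binary exponent as a float, which Go's math.Logb computes directly. A sketch of one lane (the hardware's special handling of zero, NaN, infinity, and denormals is omitted here):

package main

import (
	"fmt"
	"math"
)

// getexpPd models VGETEXPPD on a single finite nonzero lane: it returns
// floor(log2(|x|)) as a float64.
func getexpPd(x float64) float64 {
	return math.Logb(x)
}

func main() {
	fmt.Println(getexpPd(10.0), getexpPd(0.75), getexpPd(1.0)) // 3 -1 0
}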
func M256MaskzGetmantPd ¶
func M256MaskzGetmantPd(k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)
M256MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_maskz_getmant_pd'. Requires AVX512F.
func M256MaskzGetmantPs ¶
func M256MaskzGetmantPs(k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)
M256MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_maskz_getmant_ps'. Requires AVX512F.
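Mantissa normalization rescales the input's significand into the chosen interval. For the _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src combination this maps cleanly onto Go's math.Frexp, as the sketch below shows for finite nonzero inputs (zero and NaN handling omitted; helper name illustrative):

package main

import (
	"fmt"
	"math"
)

// getmantNorm12 models GetNormalizedMantissa for the [1, 2) interval
// with the source sign preserved: Frexp yields a fraction in [0.5, 1)
// carrying the sign, which doubling moves into [1, 2).
func getmantNorm12(x float64) float64 {
	frac, _ := math.Frexp(x)
	return frac * 2
}

func main() {
	fmt.Println(getmantNorm12(10.0), getmantNorm12(-0.375)) // 1.25 -1.5
}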
func M256MaskzInsertf32x4 ¶
M256MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_maskz_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzInserti32x4 ¶
M256MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[255:0] := a[255:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] ESAC FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_maskz_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzMaxEpi32 ¶
M256MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSD'. Intrinsic: '_mm256_maskz_max_epi32'. Requires AVX512F.
func M256MaskzMaxEpi64 ¶
M256MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_maskz_max_epi64'. Requires AVX512F.
func M256MaskzMaxEpu32 ¶
M256MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUD'. Intrinsic: '_mm256_maskz_max_epu32'. Requires AVX512F.
func M256MaskzMaxEpu64 ¶
M256MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_maskz_max_epu64'. Requires AVX512F.
func M256MaskzMaxPd ¶
M256MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm256_maskz_max_pd'. Requires AVX512F.
func M256MaskzMaxPs ¶
M256MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm256_maskz_max_ps'. Requires AVX512F.
func M256MaskzMinEpi32 ¶
M256MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSD'. Intrinsic: '_mm256_maskz_min_epi32'. Requires AVX512F.
func M256MaskzMinEpi64 ¶
M256MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm256_maskz_min_epi64'. Requires AVX512F.
func M256MaskzMinEpu32 ¶
M256MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUD'. Intrinsic: '_mm256_maskz_min_epu32'. Requires AVX512F.
func M256MaskzMinEpu64 ¶
M256MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm256_maskz_min_epu64'. Requires AVX512F.
func M256MaskzMinPd ¶
M256MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm256_maskz_min_pd'. Requires AVX512F.
func M256MaskzMinPs ¶
M256MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm256_maskz_min_ps'. Requires AVX512F.
func M256MaskzMovEpi32 ¶
M256MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_maskz_mov_epi32'. Requires AVX512F.
func M256MaskzMovEpi64 ¶
M256MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_maskz_mov_epi64'. Requires AVX512F.
func M256MaskzMovPd ¶
M256MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVAPD'. Intrinsic: '_mm256_maskz_mov_pd'. Requires AVX512F.
func M256MaskzMovPs ¶
M256MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVAPS'. Intrinsic: '_mm256_maskz_mov_ps'. Requires AVX512F.
func M256MaskzMovedupPd ¶
M256MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_maskz_movedup_pd'. Requires AVX512F.
func M256MaskzMovehdupPs ¶
M256MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_maskz_movehdup_ps'. Requires AVX512F.
func M256MaskzMoveldupPs ¶
M256MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_maskz_moveldup_ps'. Requires AVX512F.
func M256MaskzMulEpi32 ¶
M256MaskzMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm256_maskz_mul_epi32'. Requires AVX512F.
func M256MaskzMulEpu32 ¶
M256MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_maskz_mul_epu32'. Requires AVX512F.
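These are widening multiplies: only the low 32-bit integer of each 64-bit lane participates, and the full 64-bit product is kept, so nothing can overflow. A scalar Go sketch of the signed variant (helper name illustrative):

package main

import "fmt"

// maskzMulEpi32 models VPMULDQ with zeromasking: each pair of signed
// 32-bit inputs is widened to int64 before multiplying, so the full
// 64-bit product is stored.
func maskzMulEpi32(k uint8, a, b [4]int32) (dst [4]int64) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = int64(a[j]) * int64(b[j])
		}
	}
	return
}

func main() {
	fmt.Println(maskzMulEpi32(0xF,
		[4]int32{1 << 20, -3, 5, 7},
		[4]int32{1 << 20, 4, 6, 8}))
	// [1099511627776 -12 30 56] -- the first product needs 41 bits
}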
func M256MaskzMulPd ¶
M256MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm256_maskz_mul_pd'. Requires AVX512F.
func M256MaskzMulPs ¶
M256MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm256_maskz_mul_ps'. Requires AVX512F.
func M256MaskzMulloEpi32 ¶
M256MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULLD'. Intrinsic: '_mm256_maskz_mullo_epi32'. Requires AVX512F.
func M256MaskzOrEpi32 ¶
M256MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPORD'. Intrinsic: '_mm256_maskz_or_epi32'. Requires AVX512F.
func M256MaskzOrEpi64 ¶
M256MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPORQ'. Intrinsic: '_mm256_maskz_or_epi64'. Requires AVX512F.
func M256MaskzPermutePd ¶
M256MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzPermutePs ¶
M256MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzPermutevarPd ¶
M256MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permutevar_pd'. Requires AVX512F.
func M256MaskzPermutevarPs ¶
M256MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permutevar_ps'. Requires AVX512F.
func M256MaskzPermutex2varEpi32 ¶
func M256MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_maskz_permutex2var_epi32'. Requires AVX512F.
func M256MaskzPermutex2varEpi64 ¶
func M256MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_maskz_permutex2var_epi64'. Requires AVX512F.
func M256MaskzPermutex2varPd ¶
M256MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 IF k[j] dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_maskz_permutex2var_pd'. Requires AVX512F.
func M256MaskzPermutex2varPs ¶
M256MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 IF k[j] dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_maskz_permutex2var_ps'. Requires AVX512F.
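The permutex2var family treats 'idx' as a combined selector: the low index bits pick an element, and the next bit chooses between the two source vectors. A scalar Go sketch of the 32-bit integer variant (helper name illustrative):

package main

import "fmt"

// maskzPermutex2varEpi32 models VPERMI2D/VPERMT2D with zeromasking: for
// each lane, the low 3 bits of idx pick an element and bit 3 selects
// between source vectors a and b.
func maskzPermutex2varEpi32(k uint8, a, idx, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: lane stays 0
		}
		off := idx[j] & 0x7
		if idx[j]&0x8 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}

func main() {
	a := [8]int32{0, 1, 2, 3, 4, 5, 6, 7}
	b := [8]int32{10, 11, 12, 13, 14, 15, 16, 17}
	idx := [8]int32{0, 8, 1, 9, 2, 10, 3, 11} // interleave a and b
	fmt.Println(maskzPermutex2varEpi32(0xFF, a, idx, b))
	// [0 10 1 11 2 12 3 13]
}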
func M256MaskzPermutexEpi64 ¶
M256MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzPermutexPd ¶
M256MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzPermutexvarEpi32 ¶
M256MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm256_maskz_permutexvar_epi32'. Requires AVX512F.
func M256MaskzPermutexvarEpi64 ¶
M256MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutexvar_epi64'. Requires AVX512F.
func M256MaskzPermutexvarPd ¶
M256MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutexvar_pd'. Requires AVX512F.
func M256MaskzPermutexvarPs ¶
M256MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm256_maskz_permutexvar_ps'. Requires AVX512F.
func M256MaskzRcp14Pd ¶
M256MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm256_maskz_rcp14_pd'. Requires AVX512F.
func M256MaskzRcp14Ps ¶
M256MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm256_maskz_rcp14_ps'. Requires AVX512F.
func M256MaskzRolEpi32 ¶
M256MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm256_maskz_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRolEpi64 ¶
M256MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm256_maskz_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRolvEpi32 ¶
M256MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm256_maskz_rolv_epi32'. Requires AVX512F.
func M256MaskzRolvEpi64 ¶
M256MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm256_maskz_rolv_epi64'. Requires AVX512F.
func M256MaskzRorEpi32 ¶
M256MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm256_maskz_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRorEpi64 ¶
M256MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm256_maskz_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRorvEpi32 ¶
M256MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm256_maskz_rorv_epi32'. Requires AVX512F.
func M256MaskzRorvEpi64 ¶
M256MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm256_maskz_rorv_epi64'. Requires AVX512F.
func M256MaskzRoundscalePd ¶
M256MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
		0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
		1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
		2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
		3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC
	dst[63:0] := 2^-M * tmp[63:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[63:0] != dst[63:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[63:0]
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_maskz_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRoundscalePs ¶
M256MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
		0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
		1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
		2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
		3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC
	dst[31:0] := 2^-M * tmp[31:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[31:0] != dst[31:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_maskz_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzRsqrt14Pd ¶
M256MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_maskz_rsqrt14_pd'. Requires AVX512F.
func M256MaskzRsqrt14Ps ¶
M256MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_maskz_rsqrt14_ps'. Requires AVX512F.
func M256MaskzScalefPd ¶
M256MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_maskz_scalef_pd'. Requires AVX512F.
func M256MaskzScalefPs ¶
M256MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_maskz_scalef_ps'. Requires AVX512F.
func M256MaskzSet1Epi32 ¶
M256MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_set1_epi32'. Requires AVX512F.
func M256MaskzSet1Epi64 ¶
M256MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_set1_epi64'. Requires AVX512F.
func M256MaskzShuffleEpi32 ¶
M256MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSHUFD'. Intrinsic: '_mm256_maskz_shuffle_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShuffleF32x4 ¶
M256MaskzShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_maskz_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShuffleF64x2 ¶
M256MaskzShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_maskz_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShuffleI32x4 ¶
M256MaskzShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_maskz_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShuffleI64x2 ¶
M256MaskzShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } tmp_dst[127:0] := SELECT2(a[255:0], imm8[0]) tmp_dst[255:128] := SELECT2(b[255:0], imm8[1]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_maskz_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShufflePd ¶
M256MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm256_maskz_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzShufflePs ¶
M256MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm256_maskz_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzSllEpi32 ¶
M256MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_sll_epi32'. Requires AVX512F.
func M256MaskzSllEpi64 ¶
M256MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_sll_epi64'. Requires AVX512F.
func M256MaskzSlliEpi32 ¶
M256MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_slli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzSlliEpi64 ¶
M256MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
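A scalar sketch of the zero-masked immediate shift may help; maskzSlliEpi32 below is a hypothetical model function written for this example, not this package's API. Note the two zeroing rules: a lane is zeroed both when its mask bit is clear and when the shift count exceeds 31.

	// maskzSlliEpi32 models _mm256_maskz_slli_epi32 on eight 32-bit lanes.
	func maskzSlliEpi32(k uint8, a [8]uint32, imm8 uint8) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			if k&(1<<j) != 0 && imm8 <= 31 {
				dst[j] = a[j] << imm8
			} // otherwise the lane stays zero
		}
		return
	}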
func M256MaskzSllvEpi32 ¶
M256MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVD'. Intrinsic: '_mm256_maskz_sllv_epi32'. Requires AVX512F.
func M256MaskzSllvEpi64 ¶
M256MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_maskz_sllv_epi64'. Requires AVX512F.
func M256MaskzSqrtPd ¶
M256MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm256_maskz_sqrt_pd'. Requires AVX512F.
func M256MaskzSqrtPs ¶
M256MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm256_maskz_sqrt_ps'. Requires AVX512F.
func M256MaskzSraEpi32 ¶
M256MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_sra_epi32'. Requires AVX512F.
func M256MaskzSraEpi64 ¶
M256MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_sra_epi64'. Requires AVX512F.
func M256MaskzSraiEpi32 ¶
M256MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_srai_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzSraiEpi64 ¶
M256MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
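The SignBit fill distinguishes the arithmetic shifts from the logical shifts above. In Go, >> on a signed operand already shifts in sign bits, so a one-lane sketch (sraiLane64 is a hypothetical name used only here) just has to clamp the count:

	// sraiLane64 models one lane of the arithmetic right shift: clamping the
	// count to 63 reproduces the "fill with SignBit" case for counts > 63.
	func sraiLane64(a int64, imm8 uint8) int64 {
		if imm8 > 63 {
			imm8 = 63
		}
		return a >> imm8 // Go's >> on a signed operand shifts in sign bits
	}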
func M256MaskzSravEpi32 ¶
M256MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVD'. Intrinsic: '_mm256_maskz_srav_epi32'. Requires AVX512F.
func M256MaskzSravEpi64 ¶
M256MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_maskz_srav_epi64'. Requires AVX512F.
func M256MaskzSrlEpi32 ¶
M256MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srl_epi32'. Requires AVX512F.
func M256MaskzSrlEpi64 ¶
M256MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srl_epi64'. Requires AVX512F.
func M256MaskzSrliEpi32 ¶
M256MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzSrliEpi64 ¶
M256MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzSrlvEpi32 ¶
M256MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVD'. Intrinsic: '_mm256_maskz_srlv_epi32'. Requires AVX512F.
func M256MaskzSrlvEpi64 ¶
M256MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_maskz_srlv_epi64'. Requires AVX512F.
func M256MaskzSubEpi32 ¶
M256MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBD'. Intrinsic: '_mm256_maskz_sub_epi32'. Requires AVX512F.
func M256MaskzSubEpi64 ¶
M256MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm256_maskz_sub_epi64'. Requires AVX512F.
func M256MaskzSubPd ¶
M256MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm256_maskz_sub_pd'. Requires AVX512F.
func M256MaskzSubPs ¶
M256MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm256_maskz_sub_ps'. Requires AVX512F.
func M256MaskzTernarylogicEpi32 ¶
func M256MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_maskz_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256MaskzTernarylogicEpi64 ¶
func M256MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_maskz_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
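The imm8 truth table is easier to see in scalar form. The sketch below (ternarylogic64, a hypothetical helper written for this example) models one 64-bit lane of VPTERNLOGQ:

	// ternarylogic64 looks up bit h of the result as bit
	// ((a_h << 2) | (b_h << 1) | c_h) of imm8.
	func ternarylogic64(a, b, c uint64, imm8 uint8) uint64 {
		var dst uint64
		for h := uint(0); h < 64; h++ {
			idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1
			dst |= uint64(imm8>>idx&1) << h
		}
		return dst
	}

For instance, ternarylogic64(a, b, c, 0xE8) computes the bitwise majority function, since imm8 has bits 3, 5, 6 and 7 set (the indices 011, 101, 110 and 111 where at least two inputs are 1).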
func M256MaskzUnpackhiEpi32 ¶
M256MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi32'. Requires AVX512F.
func M256MaskzUnpackhiEpi64 ¶
M256MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi64'. Requires AVX512F.
func M256MaskzUnpackhiPd ¶
M256MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_maskz_unpackhi_pd'. Requires AVX512F.
func M256MaskzUnpackhiPs ¶
M256MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_maskz_unpackhi_ps'. Requires AVX512F.
func M256MaskzUnpackloEpi32 ¶
M256MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi32'. Requires AVX512F.
func M256MaskzUnpackloEpi64 ¶
M256MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi64'. Requires AVX512F.
func M256MaskzUnpackloPd ¶
M256MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_maskz_unpacklo_pd'. Requires AVX512F.
func M256MaskzUnpackloPs ¶
M256MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_maskz_unpacklo_ps'. Requires AVX512F.
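All four unpacklo variants share the same per-lane interleave; a scalar sketch of the 32-bit case (interleaveLowDwords is a hypothetical helper, not package API):

	// interleaveLowDwords models INTERLEAVE_DWORDS on one 128-bit lane: the two
	// low elements of each source alternate into the result.
	func interleaveLowDwords(src1, src2 [4]uint32) [4]uint32 {
		return [4]uint32{src1[0], src2[0], src1[1], src2[1]}
	}

The 256-bit forms apply this lane-wise, once to the low 128 bits of 'a' and 'b' and once to the high 128 bits, before masking.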
func M256MaskzXorEpi32 ¶
M256MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPXORD'. Intrinsic: '_mm256_maskz_xor_epi32'. Requires AVX512F.
func M256MaskzXorEpi64 ¶
M256MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm256_maskz_xor_epi64'. Requires AVX512F.
func M256MaxEpi64 ¶
M256MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 3 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_max_epi64'. Requires AVX512F.
func M256MaxEpu64 ¶
M256MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 3 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_max_epu64'. Requires AVX512F.
func M256MinEpi64 ¶
M256MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 3 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm256_min_epi64'. Requires AVX512F.
func M256MinEpu64 ¶
M256MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 3 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm256_min_epu64'. Requires AVX512F.
func M256Permutex2varEpi32 ¶
M256Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_permutex2var_epi32'. Requires AVX512F.
func M256Permutex2varEpi64 ¶
M256Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_permutex2var_epi64'. Requires AVX512F.
func M256Permutex2varPd ¶
M256Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 off := idx[i+1:i]*64 dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_permutex2var_pd'. Requires AVX512F.
func M256Permutex2varPs ¶
M256Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 off := idx[i+2:i]*32 dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_permutex2var_ps'. Requires AVX512F.
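The permutex2var family treats 'a' and 'b' together as one 16-entry table. A scalar Go model of the 32-bit integer case (permutex2varEpi32 is a hypothetical helper written for this example):

	// permutex2varEpi32 models _mm256_permutex2var_epi32: the low three bits of
	// each idx lane pick an element, and bit 3 chooses table 'a' (0) or 'b' (1).
	func permutex2varEpi32(a, idx, b [8]uint32) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			off := idx[j] & 7
			if idx[j]&8 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}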
func M256PermutexEpi64 ¶
M256PermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
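A scalar model of the imm8-driven permute (permutexEpi64 is a hypothetical helper, not part of this package):

	// permutexEpi64 models _mm256_permutex_epi64: lane j of dst is the lane of
	// 'a' named by the two imm8 bits at position 2j.
	func permutexEpi64(a [4]uint64, imm8 uint8) (dst [4]uint64) {
		for j := uint(0); j < 4; j++ {
			dst[j] = a[imm8>>(2*j)&3]
		}
		return
	}

For example, imm8 = 0x1B (binary 00 01 10 11) reverses the four lanes.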
func M256PermutexPd ¶
M256PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256PermutexvarEpi32 ¶
M256PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm256_permutexvar_epi32'. Requires AVX512F.
func M256PermutexvarEpi64 ¶
M256PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutexvar_epi64'. Requires AVX512F.
func M256PermutexvarPd ¶
M256PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 id := idx[i+1:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutexvar_pd'. Requires AVX512F.
func M256PermutexvarPs ¶
M256PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 id := idx[i+2:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm256_permutexvar_ps'. Requires AVX512F.
func M256Rcp14Pd ¶
M256Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm256_rcp14_pd'. Requires AVX512F.
func M256Rcp14Ps ¶
M256Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm256_rcp14_ps'. Requires AVX512F.
func M256RolEpi32 ¶
M256RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm256_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256RolEpi64 ¶
M256RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm256_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256RolvEpi32 ¶
M256RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm256_rolv_epi32'. Requires AVX512F.
func M256RolvEpi64 ¶
M256RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm256_rolv_epi64'. Requires AVX512F.
func M256RorEpi32 ¶
M256RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm256_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256RorEpi64 ¶
M256RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm256_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256RorvEpi32 ¶
M256RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 7 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm256_rorv_epi32'. Requires AVX512F.
func M256RorvEpi64 ¶
M256RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 3 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm256_rorv_epi64'. Requires AVX512F.
func M256RoundscalePd ¶
M256RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
		0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
		1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
		2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
		3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC
	dst[63:0] := 2^-M * tmp[63:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[63:0] != dst[63:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[63:0]
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256RoundscalePs ¶
M256RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
		0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
		1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
		2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
		3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC
	dst[31:0] := 2^-M * tmp[31:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[31:0] != dst[31:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[31:0]
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
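The scale-round-unscale structure of RoundTo_Integer maps directly onto math.Ldexp. The sketch below (roundscale, a hypothetical helper assuming import "math") covers only the common case imm8[2] = 0 and ignores the #PE bookkeeping:

	import "math"

	// roundscale rounds x to imm8[7:4] fraction bits using the rounding
	// direction in imm8[1:0].
	func roundscale(x float64, imm8 uint8) float64 {
		m := int(imm8 >> 4)        // imm8[7:4]: number of fraction bits to keep
		scaled := math.Ldexp(x, m) // 2^m * x
		var t float64
		switch imm8 & 3 { // imm8[1:0]: rounding direction
		case 0:
			t = math.RoundToEven(scaled)
		case 1:
			t = math.Floor(scaled)
		case 2:
			t = math.Ceil(scaled)
		case 3:
			t = math.Trunc(scaled)
		}
		return math.Ldexp(t, -m) // scale back down
	}

For example, roundscale(1.23, 0x40) keeps four fraction bits: 1.23*16 = 19.68 rounds to 20, and 20/16 = 1.25.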
func M256ScalefPd ¶
M256ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_scalef_pd'. Requires AVX512F.
func M256ScalefPs ¶
M256ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_scalef_ps'. Requires AVX512F.
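Ignoring the NaN, infinity, and denormal branches of SCALE, the core operation is a * 2^FLOOR(b). A one-lane Go sketch (scalefLane is a hypothetical helper, assuming import "math" and a moderate, finite 'b'):

	// scalefLane models the finite case of SCALE: multiply 'a' by two raised
	// to the floor of 'b'.
	func scalefLane(a, b float64) float64 {
		return math.Ldexp(a, int(math.Floor(b)))
	}

	// scalefLane(3.0, 2.7) = 3.0 * 2^2 = 12.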
func M256ShuffleF32x4 ¶
M256ShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256ShuffleF64x2 ¶
M256ShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256ShuffleI32x4 ¶
M256ShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256ShuffleI64x2 ¶
M256ShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT2(src, control){ CASE(control[0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT2(a[255:0], imm8[0]) dst[255:128] := SELECT2(b[255:0], imm8[1]) dst[MAX:256] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
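A scalar model of the 128-bit block shuffle (shuffleI64x2 is a hypothetical helper written for this example):

	// shuffleI64x2 models _mm256_shuffle_i64x2: each 128-bit half of dst is a
	// half of 'a' or 'b' picked by one imm8 bit.
	func shuffleI64x2(a, b [4]uint64, imm8 uint8) (dst [4]uint64) {
		ao := int(imm8&1) * 2    // imm8[0] selects the low or high half of 'a'
		bo := int(imm8>>1&1) * 2 // imm8[1] selects the low or high half of 'b'
		dst[0], dst[1] = a[ao], a[ao+1]
		dst[2], dst[3] = b[bo], b[bo+1]
		return
	}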
func M256SraEpi64 ¶
M256SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_sra_epi64'. Requires AVX512F.
func M256SraiEpi64 ¶
M256SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm256_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256SravEpi64 ¶
M256SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 3 i := j*64 dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_srav_epi64'. Requires AVX512F.
func M256TernarylogicEpi32 ¶
M256TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.
FOR j := 0 to 7 i := j*32 FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:256] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256TernarylogicEpi64 ¶
M256TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.
FOR j := 0 to 3 i := j*64 FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:256] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M256TestEpi32Mask ¶
M256TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 7 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTMD'. Intrinsic: '_mm256_test_epi32_mask'. Requires AVX512F.
func M256TestEpi64Mask ¶
M256TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 3 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_test_epi64_mask'. Requires AVX512F.
func M256TestnEpi32Mask ¶
M256TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 7 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_testn_epi32_mask'. Requires AVX512F.
func M256TestnEpi64Mask ¶
M256TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 3 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_testn_epi64_mask'. Requires AVX512F.
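The test and testn forms differ only in whether the per-lane AND is compared against non-zero or zero. A scalar sketch of the 32-bit test form (testEpi32Mask is a hypothetical helper, not package API):

	// testEpi32Mask models _mm256_test_epi32_mask: mask bit j is set when
	// lane j of a AND b is non-zero. Inverting the condition gives testn.
	func testEpi32Mask(a, b [8]uint32) (k uint8) {
		for j := 0; j < 8; j++ {
			if a[j]&b[j] != 0 {
				k |= 1 << j
			}
		}
		return
	}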
func M512AbsEpi32 ¶
M512AbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ABS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm512_abs_epi32'. Requires AVX512F.
func M512AbsEpi64 ¶
M512AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ABS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm512_abs_epi64'. Requires AVX512F.
func M512AcosPd ¶
M512AcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ACOS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_acos_pd'. Requires AVX512F.
func M512AcosPs ¶
M512AcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ACOS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_acos_ps'. Requires AVX512F.
func M512AcoshPd ¶
M512AcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ACOSH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_acosh_pd'. Requires AVX512F.
func M512AcoshPs ¶
M512AcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ACOSH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_acosh_ps'. Requires AVX512F.
func M512AddEpi64 ¶
M512AddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] + b[i+63:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm512_add_epi64'. Requires AVX512F.
func M512AlignrEpi64 ¶
M512AlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst'.
temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) dst[511:0] := temp[511:0] dst[MAX:512] := 0
Instruction: 'VALIGNQ'. Intrinsic: '_mm512_alignr_epi64'. Requires AVX512F.
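A scalar sketch of the same concatenate-and-shift, under the assumption that elements shifted past the end are zeroed as in the pseudocode (alignrEpi64 here is a hypothetical model, not the intrinsic):

    func alignrEpi64(a, b [8]uint64, count uint) [8]uint64 {
        var tmp [16]uint64
        copy(tmp[:8], b[:]) // temp[511:0]    := b
        copy(tmp[8:], a[:]) // temp[1023:512] := a
        var dst [8]uint64
        for j := uint(0); j < 8; j++ {
            if j+count < 16 {
                dst[j] = tmp[j+count] // shift right by 'count' elements
            }
        }
        return dst
    }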
func M512AsinPd ¶
M512AsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ASIN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_asin_pd'. Requires AVX512F.
func M512AsinPs ¶
M512AsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ASIN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_asin_ps'. Requires AVX512F.
func M512AsinhPd ¶
M512AsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ASINH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_asinh_pd'. Requires AVX512F.
func M512AsinhPs ¶
M512AsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ASINH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_asinh_ps'. Requires AVX512F.
func M512Atan2Pd ¶
M512Atan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atan2_pd'. Requires AVX512F.
func M512Atan2Ps ¶
M512Atan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atan2_ps'. Requires AVX512F.
func M512AtanPd ¶
M512AtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' and store the results in 'dst' expressed in radians.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATAN(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atan_pd'. Requires AVX512F.
func M512AtanPs ¶
M512AtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATAN(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atan_ps'. Requires AVX512F.
func M512AtanhPd ¶
M512AtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' and store the results in 'dst' expressed in radians.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ATANH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atanh_pd'. Requires AVX512F.
func M512AtanhPs ¶
M512AtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ATANH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_atanh_ps'. Requires AVX512F.
func M512BroadcastF32x4 ¶
M512BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_broadcast_f32x4'. Requires AVX512F.
func M512BroadcastF64x4 ¶
M512BroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_broadcast_f64x4'. Requires AVX512F.
func M512BroadcastI32x4 ¶
M512BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 dst[i+31:i] := a[n+31:n] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_broadcast_i32x4'. Requires AVX512F.
func M512BroadcastI64x4 ¶
M512BroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 dst[i+63:i] := a[n+63:n] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_broadcast_i64x4'. Requires AVX512F.
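All four broadcast variants above reduce to the '(j mod 4)' tiling in their pseudocode. A minimal Go model for the i64x4 case (hypothetical helper):

    func broadcastI64x4(a [4]int64) [8]int64 {
        var dst [8]int64
        for j := range dst {
            dst[j] = a[j%4] // n := (j mod 4)*64 in the pseudocode
        }
        return dst
    }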
func M512BroadcastdEpi32 ¶
M512BroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_broadcastd_epi32'. Requires AVX512F.
func M512BroadcastqEpi64 ¶
M512BroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_broadcastq_epi64'. Requires AVX512F.
func M512BroadcastsdPd ¶
M512BroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_broadcastsd_pd'. Requires AVX512F.
func M512BroadcastssPs ¶
M512BroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_broadcastss_ps'. Requires AVX512F.
func M512Castpd128Pd512 ¶
M512Castpd128Pd512: Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castpd128_pd512'. Requires AVX512F.
func M512Castpd256Pd512 ¶
M512Castpd256Pd512: Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castpd256_pd512'. Requires AVX512F.
func M512Castpd512Pd128 ¶
M512Castpd512Pd128: Cast vector of type __m512d to type __m128d.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castpd512_pd128'. Requires AVX512F.
func M512Castpd512Pd256 ¶
M512Castpd512Pd256: Cast vector of type __m512d to type __m256d.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castpd512_pd256'. Requires AVX512F.
func M512Castps128Ps512 ¶
M512Castps128Ps512: Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castps128_ps512'. Requires AVX512F.
func M512Castps256Ps512 ¶
M512Castps256Ps512: Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castps256_ps512'. Requires AVX512F.
func M512Castps512Ps128 ¶
M512Castps512Ps128: Cast vector of type __m512 to type __m128.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castps512_ps128'. Requires AVX512F.
func M512Castps512Ps256 ¶
M512Castps512Ps256: Cast vector of type __m512 to type __m256.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castps512_ps256'. Requires AVX512F.
func M512Castsi128Si512 ¶
M512Castsi128Si512: Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castsi128_si512'. Requires AVX512F.
func M512Castsi256Si512 ¶
M512Castsi256Si512: Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castsi256_si512'. Requires AVX512F.
func M512Castsi512Si128 ¶
M512Castsi512Si128: Cast vector of type __m512i to type __m128i.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castsi512_si128'. Requires AVX512F.
func M512Castsi512Si256 ¶
M512Castsi512Si256: Cast vector of type __m512i to type __m256i.
This intrinsic is only used for compilation and does not generate any
instructions, thus it has zero latency.
Instruction: ''. Intrinsic: '_mm512_castsi512_si256'. Requires AVX512F.
func M512CbrtPd ¶
M512CbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := CubeRoot(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cbrt_pd'. Requires AVX512F.
func M512CbrtPs ¶
M512CbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := CubeRoot(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cbrt_ps'. Requires AVX512F.
func M512CdfnormPd ¶
M512CdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := CDFNormal(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cdfnorm_pd'. Requires AVX512F.
func M512CdfnormPs ¶
M512CdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := CDFNormal(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cdfnorm_ps'. Requires AVX512F.
func M512CdfnorminvPd ¶
M512CdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_pd'. Requires AVX512F.
func M512CdfnorminvPs ¶
M512CdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_ps'. Requires AVX512F.
func M512CeilPd ¶
M512CeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := CEIL(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_ceil_pd'. Requires AVX512F.
func M512CeilPs ¶
M512CeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := CEIL(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_ceil_ps'. Requires AVX512F.
func M512CmpEpi64Mask ¶
M512CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512CmpEpu64Mask ¶
M512CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
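Both entries share the same eight predicate encodings; only the signedness of the lane comparison differs. A per-lane Go sketch of the imm8 decode (cmpLane is a hypothetical helper; the _MM_CMPINT_* names are Intel's):

    func cmpLane(a, b int64, imm8 byte) bool {
        switch imm8 & 7 {
        case 0:
            return a == b // _MM_CMPINT_EQ
        case 1:
            return a < b // _MM_CMPINT_LT
        case 2:
            return a <= b // _MM_CMPINT_LE
        case 3:
            return false // _MM_CMPINT_FALSE
        case 4:
            return a != b // _MM_CMPINT_NEQ
        case 5:
            return a >= b // _MM_CMPINT_NLT
        case 6:
            return a > b // _MM_CMPINT_NLE
        default:
            return true // _MM_CMPINT_TRUE
        }
    }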
func M512CmpeqEpi64Mask ¶
M512CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_cmpeq_epi64_mask'. Requires AVX512F.
func M512CmpeqEpu64Mask ¶
M512CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpeq_epu64_mask'. Requires AVX512F.
func M512CmpgeEpi64Mask ¶
M512CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpge_epi64_mask'. Requires AVX512F.
func M512CmpgeEpu64Mask ¶
M512CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpge_epu64_mask'. Requires AVX512F.
func M512CmpgtEpi64Mask ¶
M512CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_cmpgt_epi64_mask'. Requires AVX512F.
func M512CmpgtEpu64Mask ¶
M512CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpgt_epu64_mask'. Requires AVX512F.
func M512CmpleEpi64Mask ¶
M512CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmple_epi64_mask'. Requires AVX512F.
func M512CmpleEpu64Mask ¶
M512CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmple_epu64_mask'. Requires AVX512F.
func M512CmpltEpi32Mask ¶
M512CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*32 k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmplt_epi32_mask'. Requires AVX512F.
func M512CmpltEpi64Mask ¶
M512CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmplt_epi64_mask'. Requires AVX512F.
func M512CmpltEpu64Mask ¶
M512CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmplt_epu64_mask'. Requires AVX512F.
func M512CmpneqEpi64Mask ¶
M512CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpneq_epi64_mask'. Requires AVX512F.
func M512CmpneqEpu64Mask ¶
M512CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*64 k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpneq_epu64_mask'. Requires AVX512F.
func M512CosPd ¶
M512CosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := COS(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cos_pd'. Requires AVX512F.
func M512CosPs ¶
M512CosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := COS(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cos_ps'. Requires AVX512F.
func M512CosdPd ¶
M512CosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := COSD(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cosd_pd'. Requires AVX512F.
func M512CosdPs ¶
M512CosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := COSD(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cosd_ps'. Requires AVX512F.
func M512CoshPd ¶
M512CoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := COSH(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cosh_pd'. Requires AVX512F.
func M512CoshPs ¶
M512CoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := COSH(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_cosh_ps'. Requires AVX512F.
func M512CvtRoundepi32Ps ¶
M512CvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvt_roundepi32_ps'. Requires AVX512F.
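The 'rounding' parameter used throughout these round variants follows Intel's _MM_FROUND_* bit encoding. The constants below restate the standard <immintrin.h> values for illustration (lower-cased since this package does not export them); combine a direction with the no-exceptions flag as the pseudocode comments show:

    const (
        froundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
        froundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
        froundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
        froundToZero       = 0x03 // _MM_FROUND_TO_ZERO
        froundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
        froundNoExc        = 0x08 // _MM_FROUND_NO_EXC
    )

    var rounding = froundToZero | froundNoExc // truncate, and suppress exceptions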
func M512CvtRoundepu32Ps ¶
M512CvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvt_roundepu32_ps'. Requires AVX512F.
func M512CvtRoundpdEpi32 ¶
M512CvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvt_roundpd_epi32'. Requires AVX512F.
func M512CvtRoundpdEpu32 ¶
M512CvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvt_roundpd_epu32'. Requires AVX512F.
func M512CvtRoundpdPs ¶
M512CvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvt_roundpd_ps'. Requires AVX512F.
func M512CvtRoundphPs ¶
M512CvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvt_roundph_ps'. Requires AVX512F.
func M512CvtRoundpsEpi32 ¶
M512CvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvt_roundps_epi32'. Requires AVX512F.
func M512CvtRoundpsEpu32 ¶
M512CvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvt_roundps_epu32'. Requires AVX512F.
func M512CvtRoundpsPd ¶
M512CvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvt_roundps_pd'. Requires AVX512F.
func M512CvtRoundpsPh ¶
M512CvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvt_roundps_ph'. Requires AVX512F.
func M512Cvtepi16Epi32 ¶
M512Cvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 16*j dst[i+31:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_cvtepi16_epi32'. Requires AVX512F.
func M512Cvtepi16Epi64 ¶
M512Cvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 16*j dst[i+63:i] := SignExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_cvtepi16_epi64'. Requires AVX512F.
func M512Cvtepi32Epi16 ¶
M512Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 16*j dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm512_cvtepi32_epi16'. Requires AVX512F.
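Truncation keeps the low bits verbatim, so out-of-range lanes wrap rather than clamp (contrast the saturating Cvtsepi32Epi16 further below). In scalar Go terms (hypothetical helper):

    func truncInt32ToInt16(x int32) int16 {
        // Go's integer conversion keeps the low 16 bits:
        // truncInt32ToInt16(0x12345) == 0x2345
        return int16(x)
    }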
func M512Cvtepi32Epi64 ¶
M512Cvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := SignExtend(a[k+31:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_cvtepi32_epi64'. Requires AVX512F.
func M512Cvtepi32Epi8 ¶
M512Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 8*j dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm512_cvtepi32_epi8'. Requires AVX512F.
func M512Cvtepi32Pd ¶
M512Cvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 7 i := j*32 m := j*64 dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_cvtepi32_pd'. Requires AVX512F.
func M512Cvtepi32Ps ¶
M512Cvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvtepi32_ps'. Requires AVX512F.
func M512Cvtepi64Epi16 ¶
M512Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 16*j dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm512_cvtepi64_epi16'. Requires AVX512F.
func M512Cvtepi64Epi32 ¶
M512Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm512_cvtepi64_epi32'. Requires AVX512F.
func M512Cvtepi64Epi8 ¶
M512Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 8*j dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm512_cvtepi64_epi8'. Requires AVX512F.
func M512Cvtepi8Epi32 ¶
M512Cvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 8*j dst[i+31:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_cvtepi8_epi32'. Requires AVX512F.
func M512Cvtepi8Epi64 ¶
M512Cvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 8*j dst[i+63:i] := SignExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_cvtepi8_epi64'. Requires AVX512F.
func M512Cvtepu16Epi32 ¶
M512Cvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 16*j dst[i+31:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_cvtepu16_epi32'. Requires AVX512F.
func M512Cvtepu16Epi64 ¶
M512Cvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 16*j dst[i+63:i] := ZeroExtend(a[k+15:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_cvtepu16_epi64'. Requires AVX512F.
func M512Cvtepu32Epi64 ¶
M512Cvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := ZeroExtend(a[k+31:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_cvtepu32_epi64'. Requires AVX512F.
func M512Cvtepu32Pd ¶
M512Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 l := j*32 dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_cvtepu32_pd'. Requires AVX512F.
func M512Cvtepu32Ps ¶
M512Cvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvtepu32_ps'. Requires AVX512F.
func M512Cvtepu8Epi32 ¶
M512Cvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 8*j dst[i+31:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_cvtepu8_epi32'. Requires AVX512F.
func M512Cvtepu8Epi64 ¶
M512Cvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 8*j dst[i+63:i] := ZeroExtend(a[k+7:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_cvtepu8_epi64'. Requires AVX512F.
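The whole Cvtepi/Cvtepu widening family differs only in how the new high bits are filled, which ordinary Go conversions already model (hypothetical helpers):

    func signExtend8to64(x int8) int64 { return int64(x) } // replicate the sign bit

    func zeroExtend8to64(x uint8) uint64 { return uint64(x) } // fill with zeros

    // signExtend8to64(-1) == -1, but zeroExtend8to64(0xFF) == 255.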
func M512CvtpdEpi32 ¶
M512CvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvtpd_epi32'. Requires AVX512F.
func M512CvtpdEpu32 ¶
M512CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvtpd_epu32'. Requires AVX512F.
func M512CvtpdPs ¶
M512CvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvtpd_ps'. Requires AVX512F.
func M512CvtphPs ¶
M512CvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 m := j*16 dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvtph_ps'. Requires AVX512F.
func M512CvtpsEpi32 ¶
M512CvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvtps_epi32'. Requires AVX512F.
func M512CvtpsEpu32 ¶
M512CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvtps_epu32'. Requires AVX512F.
func M512CvtpsPd ¶
M512CvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvtps_pd'. Requires AVX512F.
func M512CvtpsPh ¶
M512CvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 16*j l := 32*j dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvtps_ph'. Requires AVX512F.
func M512Cvtsepi32Epi16 ¶
M512Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 16*j dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_cvtsepi32_epi16'. Requires AVX512F.
func M512Cvtsepi32Epi8 ¶
M512Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 8*j dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_cvtsepi32_epi8'. Requires AVX512F.
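Unlike the truncating conversions above, saturation clamps to the target range; the Cvtusepi variants further below clamp to the unsigned range [0, 2^w-1] instead. A per-lane Go sketch (hypothetical helper):

    func saturateInt32ToInt8(x int32) int8 {
        switch {
        case x > 127:
            return 127 // pin to the largest representable int8
        case x < -128:
            return -128 // pin to the smallest representable int8
        default:
            return int8(x)
        }
    }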
func M512Cvtsepi64Epi16 ¶
M512Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 16*j dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_cvtsepi64_epi16'. Requires AVX512F.
func M512Cvtsepi64Epi32 ¶
M512Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_cvtsepi64_epi32'. Requires AVX512F.
func M512Cvtsepi64Epi8 ¶
M512Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 8*j dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_cvtsepi64_epi8'. Requires AVX512F.
func M512CvttRoundpdEpi32 ¶
M512CvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvtt_roundpd_epi32'. Requires AVX512F.
func M512CvttRoundpdEpu32 ¶
M512CvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvtt_roundpd_epu32'. Requires AVX512F.
func M512CvttRoundpsEpi32 ¶
M512CvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvtt_roundps_epi32'. Requires AVX512F.
func M512CvttRoundpsEpu32 ¶
M512CvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvtt_roundps_epu32'. Requires AVX512F.
func M512CvttpdEpi32 ¶
M512CvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvttpd_epi32'. Requires AVX512F.
func M512CvttpdEpu32 ¶
M512CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 32*j k := 64*j dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k]) ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvttpd_epu32'. Requires AVX512F.
func M512CvttpsEpi32 ¶
M512CvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvttps_epi32'. Requires AVX512F.
func M512CvttpsEpu32 ¶
M512CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvttps_epu32'. Requires AVX512F.
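A scalar model of the truncating float-to-int conversions, assuming the documented x86 behavior of producing the "integer indefinite" value 0x80000000 for NaN or out-of-range inputs (a plain Go conversion would leave that case implementation-dependent; cvttFP32ToInt32 is a hypothetical helper):

    import "math"

    func cvttFP32ToInt32(x float32) int32 {
        t := math.Trunc(float64(x)) // truncate toward zero
        if math.IsNaN(t) || t < math.MinInt32 || t > math.MaxInt32 {
            return math.MinInt32 // integer indefinite, 0x80000000
        }
        return int32(t)
    }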
func M512Cvtusepi32Epi16 ¶
M512Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_cvtusepi32_epi16'. Requires AVX512F.
func M512Cvtusepi32Epi8 ¶
M512Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_cvtusepi32_epi8'. Requires AVX512F.
func M512Cvtusepi64Epi16 ¶
M512Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 16*j dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_cvtusepi64_epi16'. Requires AVX512F.
func M512Cvtusepi64Epi32 ¶
M512Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 32*j dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_cvtusepi64_epi32'. Requires AVX512F.
func M512Cvtusepi64Epi8 ¶
M512Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j k := 8*j dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_cvtusepi64_epi8'. Requires AVX512F.
func M512DivEpi16 ¶
M512DivEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 31 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epi16'. Requires AVX512F.
func M512DivEpi32 ¶
M512DivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epi32'. Requires AVX512F.
func M512DivEpi64 ¶
M512DivEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 7 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epi64'. Requires AVX512F.
func M512DivEpi8 ¶
M512DivEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 63 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epi8'. Requires AVX512F.
func M512DivEpu16 ¶
M512DivEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 31 i := 16*j dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epu16'. Requires AVX512F.
func M512DivEpu32 ¶
M512DivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epu32'. Requires AVX512F.
func M512DivEpu64 ¶
M512DivEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 7 i := 64*j dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epu64'. Requires AVX512F.
func M512DivEpu8 ¶
M512DivEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.
FOR j := 0 to 63 i := 8*j dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_div_epu8'. Requires AVX512F.
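Go's integer division already truncates toward zero, so the per-lane arithmetic of this div family is just the native operator; division by zero is an error there as it is here (hypothetical helper):

    func divEpi32Lane(a, b int32) int32 { return a / b } // panics when b == 0

    // divEpi32Lane(-7, 2) == -3: truncation toward zero, not floor.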
func M512DivPd ¶
M512DivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.
FOR j := 0 to 7 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_pd'. Requires AVX512F.
func M512DivPs ¶
M512DivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_ps'. Requires AVX512F.
func M512DivRoundPd ¶
M512DivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 64*j dst[i+63:i] := a[i+63:i] / b[i+63:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_round_pd'. Requires AVX512F.
func M512DivRoundPs ¶
M512DivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j dst[i+31:i] := a[i+31:i] / b[i+31:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_round_ps'. Requires AVX512F.
func M512ErfPd ¶
M512ErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ERF(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erf_pd'. Requires AVX512F.
func M512ErfPs ¶
M512ErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ERF(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erf_ps'. Requires AVX512F.
func M512ErfcPd ¶
M512ErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfc_pd'. Requires AVX512F.
func M512ErfcPs ¶
M512ErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfc_ps'. Requires AVX512F.
func M512ErfcinvPd ¶
M512ErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfcinv_pd'. Requires AVX512F.
func M512ErfcinvPs ¶
M512ErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfcinv_ps'. Requires AVX512F.
func M512ErfinvPd ¶
M512ErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := 1.0 / ERF(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfinv_pd'. Requires AVX512F.
func M512ErfinvPs ¶
M512ErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := 1.0 / ERF(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_erfinv_ps'. Requires AVX512F.
func M512Exp10Pd ¶
M512Exp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := 10^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp10_pd'. Requires AVX512F.
func M512Exp10Ps ¶
M512Exp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := 10^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp10_ps'. Requires AVX512F.
func M512Exp2Pd ¶
M512Exp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := 2^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp2_pd'. Requires AVX512F.
func M512Exp2Ps ¶
M512Exp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := 2^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp2_ps'. Requires AVX512F.
func M512ExpPd ¶
M512ExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := e^(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp_pd'. Requires AVX512F.
func M512ExpPs ¶
M512ExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := e^(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_exp_ps'. Requires AVX512F.
func M512Expm1Pd ¶
M512Expm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := e^(a[i+63:i]) - 1.0 ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_expm1_pd'. Requires AVX512F.
func M512Expm1Ps ¶
M512Expm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := e^(a[i+31:i]) - 1.0 ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_expm1_ps'. Requires AVX512F.
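Because these packages carry no working intrinsics, the per-element pseudocode above is easy to mirror in plain Go. A minimal sketch of the expm1 semantics using the standard math package (the helper name and the [16]float32 model of an M512 are illustrative only):

package demo

import "math"

// expm1Ps16 mirrors the _mm512_expm1_ps pseudocode lane by lane:
// dst[j] = e^a[j] - 1.0 for each of the 16 float32 lanes.
func expm1Ps16(a [16]float32) (dst [16]float32) {
	for j := 0; j < 16; j++ {
		// math.Expm1 stays accurate where computing e^x - 1 directly would cancel.
		dst[j] = float32(math.Expm1(float64(a[j])))
	}
	return
}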
func M512Extractf32x4Ps ¶
M512Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
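The lane selection performed by 'imm8' can be sketched in plain Go; here a [16]float32 stands in for the 512-bit vector and the helper name is illustrative:

// extractF32x4 mirrors VEXTRACTF32X4: pick one of four 128-bit lanes.
func extractF32x4(a [16]float32, imm8 byte) (dst [4]float32) {
	lane := int(imm8 & 3) // only the low two bits select a lane
	copy(dst[:], a[lane*4:lane*4+4])
	return
}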
func M512Extractf64x4Pd ¶
M512Extractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_extractf64x4_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Extracti32x4Epi32 ¶
M512Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[127:0] := a[127:0] 1: dst[127:0] := a[255:128] 2: dst[127:0] := a[383:256] 3: dst[127:0] := a[511:384] ESAC dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Extracti64x4Epi64 ¶
M512Extracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.
CASE imm8[7:0] of 0: dst[255:0] := a[255:0] 1: dst[255:0] := a[511:256] ESAC dst[MAX:256] := 0
Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_extracti64x4_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512FixupimmPd ¶
M512FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign ? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512FixupimmPs ¶
M512FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
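The nibble lookup at the heart of FIXUPIMM, token_response[3:0] := src3[3+4*j:4*j], is ordinary bit manipulation; a plain-Go sketch (the helper name is illustrative):

// tokenResponse extracts the 4-bit response that an element of 'c'
// assigns to token type j (0..7).
func tokenResponse(src3 uint32, j uint) uint32 {
	return (src3 >> (4 * j)) & 0xF
}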
func M512FixupimmRoundPd ¶
func M512FixupimmRoundPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
M512FixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign ? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512FixupimmRoundPs ¶
func M512FixupimmRoundPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
M512FixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512FloorPd ¶
M512FloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := FLOOR(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_floor_pd'. Requires AVX512F.
func M512FloorPs ¶
M512FloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := FLOOR(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_floor_ps'. Requires AVX512F.
func M512FmaddsubPd ¶
M512FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_pd'. Requires AVX512F.
func M512FmaddsubPs ¶
M512FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_ps'. Requires AVX512F.
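The even/odd alternation is the only subtlety here; a plain-Go sketch of the unmasked single-precision case (no fused multiply-add or rounding control is modeled, and the helper name is illustrative):

// fmaddsubPs16 mirrors _mm512_fmaddsub_ps: even lanes get a*b - c,
// odd lanes a*b + c.
func fmaddsubPs16(a, b, c [16]float32) (dst [16]float32) {
	for j := 0; j < 16; j++ {
		if j%2 == 0 {
			dst[j] = a[j]*b[j] - c[j]
		} else {
			dst[j] = a[j]*b[j] + c[j]
		}
	}
	return
}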
func M512FmaddsubRoundPd ¶
M512FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
    i := j*64
    IF (j is even)
        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
    ELSE
        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_round_pd'. Requires AVX512F.
func M512FmaddsubRoundPs ¶
M512FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
    i := j*32
    IF (j is even)
        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
    ELSE
        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_round_ps'. Requires AVX512F.
func M512FmsubaddPd ¶
M512FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_pd'. Requires AVX512F.
func M512FmsubaddPs ¶
M512FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_ps'. Requires AVX512F.
func M512FmsubaddRoundPd ¶
M512FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
    i := j*64
    IF (j is even)
        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
    ELSE
        dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_round_pd'. Requires AVX512F.
func M512FmsubaddRoundPs ¶
M512FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
    i := j*32
    IF (j is even)
        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
    ELSE
        dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_round_ps'. Requires AVX512F.
func M512HypotPd ¶
M512HypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_hypot_pd'. Requires AVX512F.
func M512HypotPs ¶
M512HypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_hypot_ps'. Requires AVX512F.
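A scalar reference for the double-precision case, assuming the standard math package ([8]float64 models an M512d; the helper name is illustrative):

package demo

import "math"

// hypotPd8 mirrors _mm512_hypot_pd lane by lane; math.Hypot avoids the
// overflow and underflow of a naive Sqrt(a*a + b*b).
func hypotPd8(a, b [8]float64) (dst [8]float64) {
	for j := 0; j < 8; j++ {
		dst[j] = math.Hypot(a[j], b[j])
	}
	return
}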
func M512Insertf32x4 ¶
M512Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.
dst[511:0] := a[511:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Insertf64x4 ¶
M512Insertf64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.
dst[511:0] := a[511:0] CASE (imm8[0]) of 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_insertf64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Inserti32x4 ¶
M512Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.
dst[511:0] := a[511:0] CASE (imm8[1:0]) of 0: dst[127:0] := b[127:0] 1: dst[255:128] := b[127:0] 2: dst[383:256] := b[127:0] 3: dst[511:384] := b[127:0] ESAC dst[MAX:512] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Inserti64x4 ¶
M512Inserti64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.
dst[511:0] := a[511:0] CASE (imm8[7:0]) OF 0: dst[255:0] := b[255:0] 1: dst[511:256] := b[255:0] ESAC dst[MAX:512] := 0
Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_inserti64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
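The copy-then-overwrite shape of the insert intrinsics is straightforward to model in plain Go (illustrative names and array models):

// insertI64x4 mirrors VINSERTI64X4: copy 'a', then overwrite the
// 256-bit half selected by bit 0 of imm8 with 'b'.
func insertI64x4(a [8]int64, b [4]int64, imm8 byte) (dst [8]int64) {
	dst = a
	half := int(imm8&1) * 4
	copy(dst[half:half+4], b[:])
	return
}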
func M512InvsqrtPd ¶
M512InvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := InvSQRT(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_invsqrt_pd'. Requires AVX512F.
func M512InvsqrtPs ¶
M512InvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := InvSQRT(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_invsqrt_ps'. Requires AVX512F.
func M512Kand ¶
M512Kand: Compute the bitwise AND of 16-bit masks 'a' and 'b', and store the result in 'k'.
k[15:0] := a[15:0] AND b[15:0] k[MAX:16] := 0
Instruction: 'KANDW'. Intrinsic: '_mm512_kand'. Requires AVX512F.
func M512Kandn ¶
M512Kandn: Compute the bitwise AND NOT of 16-bit masks 'a' and 'b', and store the result in 'k'.
k[15:0] := (NOT a[15:0]) AND b[15:0] k[MAX:16] := 0
Instruction: 'KANDNW'. Intrinsic: '_mm512_kandn'. Requires AVX512F.
func M512Kmov ¶
M512Kmov: Copy 16-bit mask 'a' to 'k'.
k[15:0] := a[15:0] k[MAX:16] := 0
Instruction: 'KMOVW'. Intrinsic: '_mm512_kmov'. Requires AVX512F.
func M512Knot ¶
M512Knot: Compute the bitwise NOT of 16-bit mask 'a', and store the result in 'k'.
k[15:0] := NOT a[15:0] k[MAX:16] := 0
Instruction: 'KNOTW'. Intrinsic: '_mm512_knot'. Requires AVX512F.
func M512Kor ¶
M512Kor: Compute the bitwise OR of 16-bit masks 'a' and 'b', and store the result in 'k'.
k[15:0] := a[15:0] OR b[15:0] k[MAX:16] := 0
Instruction: 'KORW'. Intrinsic: '_mm512_kor'. Requires AVX512F.
func M512Kortestc ¶
M512Kortestc: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. CF flag is set if 'dst' consists of all 1's.
dst[15:0] := k1[15:0] | k2[15:0] IF PopCount(dst[15:0]) = 16 SetCF() FI
Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestc'. Requires AVX512F.
func M512Kortestz ¶
M512Kortestz: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. ZF flag is set if 'dst' is 0.
dst[15:0] := k1[15:0] | k2[15:0] IF dst = 0 SetZF() FI
Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestz'. Requires AVX512F.
func M512Kunpackb ¶
M512Kunpackb: Unpack and interleave 8 bits from masks 'a' and 'b', and store the 16-bit result in 'k'.
k[7:0] := b[7:0] k[15:8] := a[7:0] k[MAX:16] := 0
Instruction: 'KUNPCKBW'. Intrinsic: '_mm512_kunpackb'. Requires AVX512F.
func M512Kxnor ¶
M512Kxnor: Compute the bitwise XNOR of 16-bit masks 'a' and 'b', and store the result in 'k'.
k[15:0] := NOT (a[15:0] XOR b[15:0]) k[MAX:16] := 0
Instruction: 'KXNORW'. Intrinsic: '_mm512_kxnor'. Requires AVX512F.
func M512Kxor ¶
M512Kxor: Compute the bitwise XOR of 16-bit masks 'a' and 'b', and store the result in 'k'.
k[15:0] := a[15:0] XOR b[15:0] k[MAX:16] := 0
Instruction: 'KXORW'. Intrinsic: '_mm512_kxor'. Requires AVX512F.
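The 16-bit mask operations above reduce to ordinary bit twiddling on a uint16; a sketch of plain-Go equivalents (function names are illustrative):

func kand(a, b uint16) uint16  { return a & b }
func kandn(a, b uint16) uint16 { return ^a & b }
func kor(a, b uint16) uint16   { return a | b }
func kxnor(a, b uint16) uint16 { return ^(a ^ b) }
func kxor(a, b uint16) uint16  { return a ^ b }

// kunpackb interleaves the low bytes: k[7:0] = b, k[15:8] = a.
func kunpackb(a, b uint16) uint16 { return b&0xFF | (a&0xFF)<<8 }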
func M512Log10Pd ¶
M512Log10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := log10(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log10_pd'. Requires AVX512F.
func M512Log10Ps ¶
M512Log10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := log10(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log10_ps'. Requires AVX512F.
func M512Log1pPd ¶
M512Log1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ln(1.0 + a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log1p_pd'. Requires AVX512F.
func M512Log1pPs ¶
M512Log1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ln(1.0 + a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log1p_ps'. Requires AVX512F.
func M512Log2Pd ¶
M512Log2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := log2(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log2_pd'. Requires AVX512F.
func M512LogPd ¶
M512LogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ln(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log_pd'. Requires AVX512F.
func M512LogPs ¶
M512LogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ln(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_log_ps'. Requires AVX512F.
func M512LogbPd ¶
M512LogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_logb_pd'. Requires AVX512F.
func M512LogbPs ¶
M512LogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_logb_ps'. Requires AVX512F.
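The standard library exposes the same exponent extraction, so a scalar reference is short (assuming the math package; names are illustrative):

package demo

import "math"

// logbPd8 mirrors _mm512_logb_pd: math.Logb returns the binary exponent
// of x, i.e. floor(log2(|x|)) for finite non-zero x.
func logbPd8(a [8]float64) (dst [8]float64) {
	for j := 0; j < 8; j++ {
		dst[j] = math.Logb(a[j])
	}
	return
}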
func M512Mask2Permutex2varEpi32 ¶
func M512Mask2Permutex2varEpi32(a x86.M512i, idx x86.M512i, k x86.Mmask16, b x86.M512i) (dst x86.M512i)
M512Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2D'. Intrinsic: '_mm512_mask2_permutex2var_epi32'. Requires AVX512F.
func M512Mask2Permutex2varEpi64 ¶
func M512Mask2Permutex2varEpi64(a x86.M512i, idx x86.M512i, k x86.Mmask8, b x86.M512i) (dst x86.M512i)
M512Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2Q'. Intrinsic: '_mm512_mask2_permutex2var_epi64'. Requires AVX512F.
func M512Mask2Permutex2varPd ¶
M512Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := idx[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2PD'. Intrinsic: '_mm512_mask2_permutex2var_pd'. Requires AVX512F.
func M512Mask2Permutex2varPs ¶
M512Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := idx[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2PS'. Intrinsic: '_mm512_mask2_permutex2var_ps'. Requires AVX512F.
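The index decoding is the tricky part of permutex2var: the low four bits of each idx element pick a lane, bit 4 picks between 'a' and 'b', and masked-off lanes take the raw idx bits reinterpreted as a float. A plain-Go sketch under those assumptions (names and array models are illustrative):

package demo

import "math"

// mask2Permutex2varPs mirrors the pseudocode above for the _ps variant.
func mask2Permutex2varPs(a [16]float32, idx [16]uint32, k uint16, b [16]float32) (dst [16]float32) {
	for j := 0; j < 16; j++ {
		if k&(1<<j) != 0 {
			off := idx[j] & 0xF   // idx[i+3:i] selects a lane
			if idx[j]&0x10 != 0 { // idx[i+4] selects the source vector
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		} else {
			// masked-off lanes copy the idx bits, reinterpreted as float32
			dst[j] = math.Float32frombits(idx[j])
		}
	}
	return
}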
func M512Mask3FmaddsubPd ¶
M512Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_pd'. Requires AVX512F.
func M512Mask3FmaddsubPs ¶
M512Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_ps'. Requires AVX512F.
func M512Mask3FmaddsubRoundPd ¶
func M512Mask3FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
M512Mask3FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
    i := j*64
    IF k[j]
        IF (j is even)
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        FI
    ELSE
        dst[i+63:i] := c[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_round_pd'. Requires AVX512F.
func M512Mask3FmaddsubRoundPs ¶
func M512Mask3FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
M512Mask3FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
    i := j*32
    IF k[j]
        IF (j is even)
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        FI
    ELSE
        dst[i+31:i] := c[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_round_ps'. Requires AVX512F.
func M512Mask3FmsubaddPd ¶
M512Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := c[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_pd'. Requires AVX512F.
func M512Mask3FmsubaddPs ¶
M512Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := c[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_ps'. Requires AVX512F.
func M512Mask3FmsubaddRoundPd ¶
func M512Mask3FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
M512Mask3FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
    i := j*64
    IF k[j]
        IF (j is even)
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        FI
    ELSE
        dst[i+63:i] := c[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_round_pd'. Requires AVX512F.
func M512Mask3FmsubaddRoundPs ¶
func M512Mask3FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
M512Mask3FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
    i := j*32
    IF k[j]
        IF (j is even)
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        FI
    ELSE
        dst[i+31:i] := c[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_round_ps'. Requires AVX512F.
func M512MaskAbsEpi32 ¶
M512MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm512_mask_abs_epi32'. Requires AVX512F.
func M512MaskAbsEpi64 ¶
M512MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm512_mask_abs_epi64'. Requires AVX512F.
func M512MaskAcosPd ¶
M512MaskAcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ACOS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_acos_pd'. Requires AVX512F.
func M512MaskAcosPs ¶
M512MaskAcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ACOS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_acos_ps'. Requires AVX512F.
func M512MaskAcoshPd ¶
M512MaskAcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ACOSH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_acosh_pd'. Requires AVX512F.
func M512MaskAcoshPs ¶
M512MaskAcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ACOSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_acosh_ps'. Requires AVX512F.
func M512MaskAddEpi64 ¶
M512MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm512_mask_add_epi64'. Requires AVX512F.
func M512MaskAlignrEpi64 ¶
func M512MaskAlignrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
M512MaskAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VALIGNQ'. Intrinsic: '_mm512_mask_alignr_epi64'. Requires AVX512F.
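The concatenate-shift-blend sequence can be mirrored with a 16-lane temporary in plain Go (illustrative names; 'count' is assumed to be in 0..15 as for VALIGNQ):

// maskAlignrEpi64 mirrors the pseudocode above: temp = a:b shifted right
// by 'count' lanes, with the low 8 lanes blended against 'src' under 'k'.
func maskAlignrEpi64(src [8]int64, k uint8, a, b [8]int64, count int) (dst [8]int64) {
	var temp [16]int64
	copy(temp[0:8], b[:])  // temp[511:0] = b
	copy(temp[8:16], a[:]) // temp[1023:512] = a
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			if j+count < 16 {
				dst[j] = temp[j+count]
			} // lanes shifted past the top are zero
		} else {
			dst[j] = src[j]
		}
	}
	return
}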
func M512MaskAsinPd ¶
M512MaskAsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ASIN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_asin_pd'. Requires AVX512F.
func M512MaskAsinPs ¶
M512MaskAsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ASIN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_asin_ps'. Requires AVX512F.
func M512MaskAsinhPd ¶
M512MaskAsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ASINH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_asinh_pd'. Requires AVX512F.
func M512MaskAsinhPs ¶
M512MaskAsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ASINH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_asinh_ps'. Requires AVX512F.
func M512MaskAtan2Pd ¶
M512MaskAtan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atan2_pd'. Requires AVX512F.
func M512MaskAtan2Ps ¶
M512MaskAtan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atan2_ps'. Requires AVX512F.
func M512MaskAtanPd ¶
M512MaskAtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATAN(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atan_pd'. Requires AVX512F.
func M512MaskAtanPs ¶
M512MaskAtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATAN(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atan_ps'. Requires AVX512F.
func M512MaskAtanhPd ¶
M512MaskAtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ATANH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atanh_pd'. Requires AVX512F.
func M512MaskAtanhPs ¶
M512MaskAtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ATANH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_atanh_ps'. Requires AVX512F.
func M512MaskBroadcastF32x4 ¶
M512MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_mask_broadcast_f32x4'. Requires AVX512F.
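The writemasked broadcast repeats a 128-bit pattern across the vector; a plain-Go sketch (illustrative names, with bit j of 'k' gating lane j):

// maskBroadcastF32x4 mirrors the pseudocode above: lane j receives
// a[j mod 4] when its mask bit is set, otherwise src[j].
func maskBroadcastF32x4(src [16]float32, k uint16, a [4]float32) (dst [16]float32) {
	for j := 0; j < 16; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[j%4]
		} else {
			dst[j] = src[j]
		}
	}
	return
}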
func M512MaskBroadcastF64x4 ¶
M512MaskBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_mask_broadcast_f64x4'. Requires AVX512F.
func M512MaskBroadcastI32x4 ¶
M512MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_mask_broadcast_i32x4'. Requires AVX512F.
func M512MaskBroadcastI64x4 ¶
M512MaskBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_mask_broadcast_i64x4'. Requires AVX512F.
func M512MaskBroadcastdEpi32 ¶
M512MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_broadcastd_epi32'. Requires AVX512F.
func M512MaskBroadcastqEpi64 ¶
M512MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_broadcastq_epi64'. Requires AVX512F.
func M512MaskBroadcastsdPd ¶
M512MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_mask_broadcastsd_pd'. Requires AVX512F.
func M512MaskBroadcastssPs ¶
M512MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_mask_broadcastss_ps'. Requires AVX512F.
func M512MaskCbrtPd ¶
M512MaskCbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CubeRoot(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_pd'. Requires AVX512F.
func M512MaskCbrtPs ¶
M512MaskCbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CubeRoot(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_ps'. Requires AVX512F.
func M512MaskCdfnormPd ¶
M512MaskCdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CDFNormal(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_pd'. Requires AVX512F.
func M512MaskCdfnormPs ¶
M512MaskCdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_ps'. Requires AVX512F.
func M512MaskCdfnorminvPd ¶
M512MaskCdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := InverseCDFNormal(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_pd'. Requires AVX512F.
func M512MaskCdfnorminvPs ¶
M512MaskCdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := InverseCDFNormal(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_ps'. Requires AVX512F.
func M512MaskCeilPd ¶
M512MaskCeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := CEIL(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_ceil_pd'. Requires AVX512F.
func M512MaskCeilPs ¶
M512MaskCeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := CEIL(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_ceil_ps'. Requires AVX512F.
func M512MaskCmpEpi64Mask ¶
M512MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpEpu64Mask ¶
M512MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpeqEpi64Mask ¶
M512MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_mask_cmpeq_epi64_mask'. Requires AVX512F.
func M512MaskCmpeqEpu64Mask ¶
M512MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpeq_epu64_mask'. Requires AVX512F.
func M512MaskCmpgeEpi64Mask ¶
M512MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpge_epi64_mask'. Requires AVX512F.
func M512MaskCmpgeEpu64Mask ¶
M512MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpge_epu64_mask'. Requires AVX512F.
func M512MaskCmpgtEpi64Mask ¶
M512MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_mask_cmpgt_epi64_mask'. Requires AVX512F.
func M512MaskCmpgtEpu64Mask ¶
M512MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpgt_epu64_mask'. Requires AVX512F.
func M512MaskCmpleEpi64Mask ¶
M512MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmple_epi64_mask'. Requires AVX512F.
func M512MaskCmpleEpu64Mask ¶
M512MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmple_epu64_mask'. Requires AVX512F.
func M512MaskCmpltEpi32Mask ¶
M512MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmplt_epi32_mask'. Requires AVX512F.
func M512MaskCmpltEpi64Mask ¶
M512MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmplt_epi64_mask'. Requires AVX512F.
func M512MaskCmpltEpu64Mask ¶
M512MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmplt_epu64_mask'. Requires AVX512F.
func M512MaskCmpneqEpi64Mask ¶
M512MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpneq_epi64_mask'. Requires AVX512F.
func M512MaskCmpneqEpu64Mask ¶
M512MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpneq_epu64_mask'. Requires AVX512F.
func M512MaskCompressEpi32 ¶
M512MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_mask_compress_epi32'. Requires AVX512F.
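A minimal plain-Go model of the compress semantics, illustrative only; the function name and the array view of the 512-bit vector are assumptions, not package API:

    package main

    import "fmt"

    // maskCompressEpi32 models VPCOMPRESSD on a 16-lane vector: active lanes
    // of a (mask bit set) are packed to the front of dst; the remaining
    // lanes pass through from src.
    func maskCompressEpi32(src [16]int32, k uint16, a [16]int32) [16]int32 {
        dst := src // tail lanes keep src
        m := 0
        for j := 0; j < 16; j++ {
            if k&(1<<j) != 0 {
                dst[m] = a[j]
                m++
            }
        }
        return dst
    }

    func main() {
        var src [16]int32 // zero pass-through tail
        a := [16]int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
        fmt.Println(maskCompressEpi32(src, 0b1010101010101010, a))
    }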
func M512MaskCompressEpi64 ¶
M512MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_mask_compress_epi64'. Requires AVX512F.
func M512MaskCompressPd ¶
M512MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_mask_compress_pd'. Requires AVX512F.
func M512MaskCompressPs ¶
M512MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := src[511:m] dst[MAX:512] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_mask_compress_ps'. Requires AVX512F.
func M512MaskCosPd ¶
M512MaskCosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COS(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cos_pd'. Requires AVX512F.
func M512MaskCosPs ¶
M512MaskCosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COS(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cos_ps'. Requires AVX512F.
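The writemask merge used by this and the following masked entries can be modeled in plain Go. A sketch for the Pd variant, assuming the 512-bit vector is viewed as eight float64 lanes (names are illustrative, not package API):

    package main

    import (
        "fmt"
        "math"
    )

    // maskCosPd models the recurring writemask pattern: lanes with the mask
    // bit set get the computed value, the rest are copied from src.
    func maskCosPd(src [8]float64, k uint8, a [8]float64) [8]float64 {
        dst := src // unselected lanes keep src
        for j := 0; j < 8; j++ {
            if k&(1<<j) != 0 {
                dst[j] = math.Cos(a[j])
            }
        }
        return dst
    }

    func main() {
        var src [8]float64
        a := [8]float64{0, math.Pi / 2, math.Pi, 0, 0, 0, 0, 0}
        fmt.Println(maskCosPd(src, 0b00000111, a))
    }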
func M512MaskCosdPd ¶
M512MaskCosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COSD(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cosd_pd'. Requires AVX512F.
func M512MaskCosdPs ¶
M512MaskCosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COSD(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cosd_ps'. Requires AVX512F.
func M512MaskCoshPd ¶
M512MaskCoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := COSH(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cosh_pd'. Requires AVX512F.
func M512MaskCoshPs ¶
M512MaskCoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := COSH(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_cosh_ps'. Requires AVX512F.
func M512MaskCvtRoundepi32Ps ¶
M512MaskCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepi32_ps'. Requires AVX512F.
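A plain-Go sketch of how the rounding mode changes the result of an inexact int32 to float32 conversion. The mode names are illustrative stand-ins for the _MM_FROUND_* constants; Go itself always converts with round-to-nearest-even, so the directed modes are derived from the nearest result:

    package main

    import (
        "fmt"
        "math"
    )

    // Illustrative stand-ins for the _MM_FROUND_* rounding selectors.
    type roundMode int

    const (
        toNearest roundMode = iota // _MM_FROUND_TO_NEAREST_INT
        toNegInf                   // _MM_FROUND_TO_NEG_INF
        toPosInf                   // _MM_FROUND_TO_POS_INF
        toZero                     // _MM_FROUND_TO_ZERO
    )

    // roundToFP32 converts an int32 to float32 under an explicit rounding
    // mode, using float64 (exact for any int32) as the intermediate.
    func roundToFP32(x int32, mode roundMode) float32 {
        exact := float64(x) // float64 holds any int32 exactly
        f := float32(exact) // Go rounds to nearest even here
        back := float64(f)
        if back == exact {
            return f // conversion was exact; mode is irrelevant
        }
        // Pick the representable neighbor required by the rounding mode.
        lower, upper := f, f
        if back > exact {
            lower = math.Nextafter32(f, float32(math.Inf(-1)))
        } else {
            upper = math.Nextafter32(f, float32(math.Inf(1)))
        }
        switch mode {
        case toNegInf:
            return lower
        case toPosInf:
            return upper
        case toZero:
            if exact > 0 {
                return lower
            }
            return upper
        default:
            return f
        }
    }

    func main() {
        x := int32(1<<24 + 1) // 16777217 is not representable in float32
        fmt.Println(roundToFP32(x, toNearest), roundToFP32(x, toNegInf), roundToFP32(x, toPosInf))
    }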
func M512MaskCvtRoundepu32Ps ¶
M512MaskCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepu32_ps'. Requires AVX512F.
func M512MaskCvtRoundpdEpi32 ¶
func M512MaskCvtRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
M512MaskCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epi32'. Requires AVX512F.
func M512MaskCvtRoundpdEpu32 ¶
func M512MaskCvtRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)
M512MaskCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epu32'. Requires AVX512F.
func M512MaskCvtRoundpdPs ¶
M512MaskCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvt_roundpd_ps'. Requires AVX512F.
func M512MaskCvtRoundphPs ¶
M512MaskCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvt_roundph_ps'. Requires AVX512F.
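A plain-Go sketch of the per-lane Convert_FP16_To_FP32 step. Every half-precision value is exactly representable as float32, so the decode needs no rounding; the raw-bits representation of the half input is an assumption for illustration:

    package main

    import (
        "fmt"
        "math"
    )

    // fp16ToFP32 decodes one IEEE 754 half-precision value (as raw bits)
    // into a float32.
    func fp16ToFP32(h uint16) float32 {
        sign := uint32(h>>15) & 1
        exp := uint32(h>>10) & 0x1F
        frac := uint32(h) & 0x3FF

        var bits uint32
        switch {
        case exp == 0 && frac == 0: // signed zero
            bits = sign << 31
        case exp == 0: // subnormal half: renormalize into a float32 normal
            e := uint32(127 - 15 + 1)
            for frac&0x400 == 0 {
                frac <<= 1
                e--
            }
            frac &= 0x3FF
            bits = sign<<31 | e<<23 | frac<<13
        case exp == 0x1F: // Inf or NaN (payload shifted into place)
            bits = sign<<31 | 0xFF<<23 | frac<<13
        default: // normal: rebias exponent from 15 to 127
            bits = sign<<31 | (exp+127-15)<<23 | frac<<13
        }
        return math.Float32frombits(bits)
    }

    func main() {
        fmt.Println(fp16ToFP32(0x3C00)) // 1.0
        fmt.Println(fp16ToFP32(0xC000)) // -2.0
        fmt.Println(fp16ToFP32(0x7BFF)) // 65504, the largest half value
    }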
func M512MaskCvtRoundpsEpi32 ¶
func M512MaskCvtRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
M512MaskCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvt_roundps_epi32'. Requires AVX512F.
func M512MaskCvtRoundpsEpu32 ¶
func M512MaskCvtRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)
M512MaskCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvt_roundps_epu32'. Requires AVX512F.
func M512MaskCvtRoundpsPd ¶
M512MaskCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvt_roundps_pd'. Requires AVX512F.
func M512MaskCvtRoundpsPh ¶
M512MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvt_roundps_ph'. Requires AVX512F.
func M512MaskCvtepi16Epi32 ¶
M512MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_mask_cvtepi16_epi32'. Requires AVX512F.
func M512MaskCvtepi16Epi64 ¶
M512MaskCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_mask_cvtepi16_epi64'. Requires AVX512F.
func M512MaskCvtepi32Epi16 ¶
M512MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm512_mask_cvtepi32_epi16'. Requires AVX512F.
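The Truncate_Int32_To_Int16 step keeps only the low 16 bits, which is exactly what a plain Go integer conversion does:

    package main

    import "fmt"

    // truncInt32ToInt16 models Truncate_Int32_To_Int16: keep bits [15:0]
    // and reinterpret them as a signed value.
    func truncInt32ToInt16(x int32) int16 {
        return int16(x)
    }

    func main() {
        fmt.Println(truncInt32ToInt16(0x12345)) // 0x2345 = 9029
        fmt.Println(truncInt32ToInt16(-1))      // -1 (all bits set)
        fmt.Println(truncInt32ToInt16(0x18000)) // 0x8000 = -32768
    }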
func M512MaskCvtepi32Epi64 ¶
M512MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_mask_cvtepi32_epi64'. Requires AVX512F.
func M512MaskCvtepi32Epi8 ¶
M512MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm512_mask_cvtepi32_epi8'. Requires AVX512F.
func M512MaskCvtepi32Pd ¶
M512MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_mask_cvtepi32_pd'. Requires AVX512F.
func M512MaskCvtepi32Ps ¶
M512MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvtepi32_ps'. Requires AVX512F.
func M512MaskCvtepi64Epi16 ¶
M512MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm512_mask_cvtepi64_epi16'. Requires AVX512F.
func M512MaskCvtepi64Epi32 ¶
M512MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm512_mask_cvtepi64_epi32'. Requires AVX512F.
func M512MaskCvtepi64Epi8 ¶
M512MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm512_mask_cvtepi64_epi8'. Requires AVX512F.
func M512MaskCvtepi8Epi32 ¶
M512MaskCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_mask_cvtepi8_epi32'. Requires AVX512F.
func M512MaskCvtepi8Epi64 ¶
M512MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_mask_cvtepi8_epi64'. Requires AVX512F.
func M512MaskCvtepu16Epi32 ¶
M512MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_mask_cvtepu16_epi32'. Requires AVX512F.
func M512MaskCvtepu16Epi64 ¶
M512MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_mask_cvtepu16_epi64'. Requires AVX512F.
func M512MaskCvtepu32Epi64 ¶
M512MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_mask_cvtepu32_epi64'. Requires AVX512F.
func M512MaskCvtepu32Pd ¶
M512MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedInt32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_mask_cvtepu32_pd'. Requires AVX512F.
func M512MaskCvtepu32Ps ¶
M512MaskCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvtepu32_ps'. Requires AVX512F.
func M512MaskCvtepu8Epi32 ¶
M512MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_mask_cvtepu8_epi32'. Requires AVX512F.
func M512MaskCvtepu8Epi64 ¶
M512MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_mask_cvtepu8_epi64'. Requires AVX512F.
func M512MaskCvtpdEpi32 ¶
M512MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvtpd_epi32'. Requires AVX512F.
func M512MaskCvtpdEpu32 ¶
M512MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvtpd_epu32'. Requires AVX512F.
func M512MaskCvtpdPs ¶
M512MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvtpd_ps'. Requires AVX512F.
func M512MaskCvtphPs ¶
M512MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvtph_ps'. Requires AVX512F.
func M512MaskCvtpsEpi32 ¶
M512MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvtps_epi32'. Requires AVX512F.
func M512MaskCvtpsEpu32 ¶
M512MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvtps_epu32'. Requires AVX512F.
func M512MaskCvtpsPd ¶
M512MaskCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvtps_pd'. Requires AVX512F.
func M512MaskCvtpsPh ¶
M512MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvtps_ph'. Requires AVX512F.
func M512MaskCvtsepi32Epi16 ¶
M512MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_mask_cvtsepi32_epi16'. Requires AVX512F.
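Unlike the truncating VPMOVDW above, signed saturation clamps out-of-range values to the int16 limits. A plain-Go sketch of Saturate_Int32_To_Int16:

    package main

    import (
        "fmt"
        "math"
    )

    // satInt32ToInt16 models Saturate_Int32_To_Int16: out-of-range values
    // clamp to the int16 limits instead of wrapping.
    func satInt32ToInt16(x int32) int16 {
        switch {
        case x > math.MaxInt16:
            return math.MaxInt16
        case x < math.MinInt16:
            return math.MinInt16
        default:
            return int16(x)
        }
    }

    func main() {
        fmt.Println(satInt32ToInt16(100000))  // 32767
        fmt.Println(satInt32ToInt16(-100000)) // -32768
        fmt.Println(satInt32ToInt16(1234))    // 1234
    }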
func M512MaskCvtsepi32Epi8 ¶
M512MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_mask_cvtsepi32_epi8'. Requires AVX512F.
func M512MaskCvtsepi64Epi16 ¶
M512MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_mask_cvtsepi64_epi16'. Requires AVX512F.
func M512MaskCvtsepi64Epi32 ¶
M512MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_mask_cvtsepi64_epi32'. Requires AVX512F.
func M512MaskCvtsepi64Epi8 ¶
M512MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_mask_cvtsepi64_epi8'. Requires AVX512F.
func M512MaskCvttRoundpdEpi32 ¶
M512MaskCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epi32'. Requires AVX512F.
func M512MaskCvttRoundpdEpu32 ¶
M512MaskCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epu32'. Requires AVX512F.
func M512MaskCvttRoundpsEpi32 ¶
M512MaskCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epi32'. Requires AVX512F.
func M512MaskCvttRoundpsEpu32 ¶
M512MaskCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epu32'. Requires AVX512F.
func M512MaskCvttpdEpi32 ¶
M512MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvttpd_epi32'. Requires AVX512F.
func M512MaskCvttpdEpu32 ¶
M512MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvttpd_epu32'. Requires AVX512F.
func M512MaskCvttpsEpi32 ¶
M512MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvttps_epi32'. Requires AVX512F.
func M512MaskCvttpsEpu32 ¶
M512MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvttps_epu32'. Requires AVX512F.
func M512MaskCvtusepi32Epi16 ¶
M512MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_mask_cvtusepi32_epi16'. Requires AVX512F.
func M512MaskCvtusepi32Epi8 ¶
M512MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_mask_cvtusepi32_epi8'. Requires AVX512F.
func M512MaskCvtusepi64Epi16 ¶
M512MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_mask_cvtusepi64_epi16'. Requires AVX512F.
func M512MaskCvtusepi64Epi32 ¶
M512MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_mask_cvtusepi64_epi32'. Requires AVX512F.
func M512MaskCvtusepi64Epi8 ¶
M512MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_mask_cvtusepi64_epi8'. Requires AVX512F.
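A plain-Go sketch of the unsigned saturation step; Saturate_UnsignedInt64_To_Int8 in the pseudocode clamps to the uint8 range:

    package main

    import "fmt"

    // satUint64ToUint8 models Saturate_UnsignedInt64_To_Int8 as used by
    // VPMOVUSQB: values above 255 clamp to 255.
    func satUint64ToUint8(x uint64) uint8 {
        if x > 0xFF {
            return 0xFF
        }
        return uint8(x)
    }

    func main() {
        fmt.Println(satUint64ToUint8(300))     // 255
        fmt.Println(satUint64ToUint8(42))      // 42
        fmt.Println(satUint64ToUint8(1 << 40)) // 255
    }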
func M512MaskDivEpi32 ¶
M512MaskDivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_div_epi32'. Requires AVX512F.
func M512MaskDivEpu32 ¶
M512MaskDivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_div_epu32'. Requires AVX512F.
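The TRUNCATE step in the two division entries above matches Go's own integer division, which truncates toward zero:

    package main

    import "fmt"

    func main() {
        // Go's integer division truncates toward zero, matching the
        // TRUNCATE(a / b) step in the pseudocode above.
        fmt.Println(int32(7) / int32(2))   // 3
        fmt.Println(int32(-7) / int32(2))  // -3, not -4
        fmt.Println(uint32(7) / uint32(2)) // 3
    }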
func M512MaskDivPd ¶
M512MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_pd'. Requires AVX512F.
func M512MaskDivPs ¶
M512MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_ps'. Requires AVX512F.
func M512MaskDivRoundPd ¶
func M512MaskDivRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
M512MaskDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_round_pd'. Requires AVX512F.
func M512MaskDivRoundPs ¶
func M512MaskDivRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
M512MaskDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_round_ps'. Requires AVX512F.
func M512MaskErfPd ¶
M512MaskErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ERF(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erf_pd'. Requires AVX512F.
func M512MaskErfPs ¶
M512MaskErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erf_ps'. Requires AVX512F.
func M512MaskErfcPd ¶
M512MaskErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 1.0 - ERF(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfc_pd'. Requires AVX512F.
func M512MaskErfcPs ¶
M512MaskErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 1.0 - ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfc_ps'. Requires AVX512F.
func M512MaskErfcinvPd ¶
M512MaskErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_pd'. Requires AVX512F.
func M512MaskErfcinvPs ¶
M512MaskErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_ps'. Requires AVX512F.
func M512MaskErfinvPd ¶
M512MaskErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 1.0 / ERF(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_pd'. Requires AVX512F.
func M512MaskErfinvPs ¶
M512MaskErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 1.0 / ERF(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_ps'. Requires AVX512F.
func M512MaskExp10Pd ¶
M512MaskExp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 10^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp10_pd'. Requires AVX512F.
func M512MaskExp10Ps ¶
M512MaskExp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 10^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp10_ps'. Requires AVX512F.
func M512MaskExp2Pd ¶
M512MaskExp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := 2^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp2_pd'. Requires AVX512F.
func M512MaskExp2Ps ¶
M512MaskExp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := 2^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp2_ps'. Requires AVX512F.
func M512MaskExpPd ¶
M512MaskExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := e^(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp_pd'. Requires AVX512F.
func M512MaskExpPs ¶
M512MaskExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := e^(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_exp_ps'. Requires AVX512F.
func M512MaskExpandEpi32 ¶
M512MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_mask_expand_epi32'. Requires AVX512F.
func M512MaskExpandEpi64 ¶
M512MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_mask_expand_epi64'. Requires AVX512F.
func M512MaskExpandPd ¶
M512MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_mask_expand_pd'. Requires AVX512F.
func M512MaskExpandPs ¶
M512MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_mask_expand_ps'. Requires AVX512F.
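Expand is the inverse of the compress operations earlier in this section. A plain-Go model (the function name and array view are assumptions, not package API):

    package main

    import "fmt"

    // maskExpandPs models VEXPANDPS: consecutive elements from the front of
    // a are scattered to the lanes whose mask bit is set; unselected lanes
    // are copied from src.
    func maskExpandPs(src [16]float32, k uint16, a [16]float32) [16]float32 {
        dst := src
        m := 0
        for j := 0; j < 16; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[m]
                m++
            }
        }
        return dst
    }

    func main() {
        var src [16]float32
        a := [16]float32{1, 2, 3, 4} // only the first popcount(k) lanes are read
        fmt.Println(maskExpandPs(src, 0b0000_0000_1010_1010, a))
    }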
func M512MaskExpm1Pd ¶
M512MaskExpm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := e^(a[i+63:i]) - 1.0 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_expm1_pd'. Requires AVX512F.
func M512MaskExpm1Ps ¶
M512MaskExpm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := e^(a[i+31:i]) - 1.0 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_expm1_ps'. Requires AVX512F.
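Go's standard library offers the same operation for scalars: math.Expm1 computes e^x - 1 without the catastrophic cancellation of math.Exp(x) - 1 for small x:

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        x := 1e-12
        fmt.Println(math.Expm1(x))     // accurate: ~1.0000000000005e-12
        fmt.Println(math.Exp(x) - 1.0) // loses most significant digits to cancellation
    }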
func M512MaskExtractf32x4Ps ¶
M512MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_mask_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskExtractf64x4Pd ¶
M512MaskExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_mask_extractf64x4_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskExtracti32x4Epi32 ¶
M512MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_mask_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskExtracti64x4Epi64 ¶
M512MaskExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_mask_extracti64x4_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
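All four extract variants share one shape: a small immediate selects a 128-bit or 256-bit lane, and the writemask then blends the selected lane with 'src'. A minimal pure-Go sketch of the 32x4 case follows; the name and array types are invented, with [4]float32 standing in for a 128-bit vector.

    // maskExtractF32x4 models _mm512_mask_extractf32x4_ps: imm8[1:0]
    // selects one of four 4-element lanes of a; masked-off lanes of the
    // result come from src instead.
    func maskExtractF32x4(src [4]float32, k uint8, a [16]float32, imm byte) [4]float32 {
        var dst [4]float32
        base := int(imm&3) * 4 // start of the selected 128-bit lane
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[base+j]
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }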
func M512MaskFixupimmPd ¶
func M512MaskFixupimmPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)
M512MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
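The FIXUPIMMPD pseudocode is dense, so here is a partial pure-Go model of a single lane: classify 'src2' into one of the eight tokens, fetch that token's 4-bit response from 'src3', and map the response to a value. All names are invented for illustration; the DAZ handling, the flag-raising CASE, the SNaN/QNaN distinction, and several of the sixteen responses are deliberately omitted.

    import "math"

    // tokenOf classifies a float64 into the TOKEN_TYPE ordering above.
    func tokenOf(x float64) int {
        switch {
        case math.IsNaN(x):
            return 0 // QNAN_TOKEN (SNaN not distinguished in this sketch)
        case x == 0:
            return 2 // ZERO_VALUE_TOKEN
        case x == 1:
            return 3 // ONE_VALUE_TOKEN
        case math.IsInf(x, -1):
            return 4 // NEG_INF_TOKEN
        case math.IsInf(x, 1):
            return 5 // POS_INF_TOKEN
        case x < 0:
            return 6 // NEG_VALUE_TOKEN
        default:
            return 7 // POS_VALUE_TOKEN
        }
    }

    // fixupOne models FIXUPIMMPD for one lane, covering a subset of the
    // sixteen token responses (DAZ handling of src2 is skipped).
    func fixupOne(src1, src2 float64, src3 uint64) float64 {
        j := tokenOf(src2)
        resp := (src3 >> uint(4*j)) & 0xF // token_response[3:0]
        switch resp {
        case 0:
            return src1
        case 1:
            return src2
        case 4:
            return math.Inf(-1)
        case 5:
            return math.Inf(1)
        case 8:
            return 0
        case 10:
            return 1
        case 11:
            return 0.5
        case 13:
            return math.Pi / 2
        default:
            return math.NaN() // remaining responses omitted in this sketch
        }
    }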
func M512MaskFixupimmPs ¶
func M512MaskFixupimmPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)
M512MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskFixupimmRoundPd ¶
func M512MaskFixupimmRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
M512MaskFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskFixupimmRoundPs ¶
func M512MaskFixupimmRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
M512MaskFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskFloorPd ¶
M512MaskFloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FLOOR(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_floor_pd'. Requires AVX512F.
func M512MaskFloorPs ¶
M512MaskFloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FLOOR(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_floor_ps'. Requires AVX512F.
func M512MaskFmaddsubPd ¶
M512MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_pd'. Requires AVX512F.
func M512MaskFmaddsubPs ¶
M512MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_ps'. Requires AVX512F.
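The only subtlety in fmaddsub is the even/odd alternation. Here is a minimal pure-Go sketch of the single-precision case (names invented for illustration); note that the real VFMADDSUB instructions fuse the multiply and add into a single rounding, which separate Go operations do not, and that the fmsubadd variants further down are the same loop with the two active arms swapped.

    // maskFmaddsubPs models _mm512_mask_fmaddsub_ps: even lanes compute
    // a*b - c, odd lanes compute a*b + c; masked-off lanes keep a.
    func maskFmaddsubPs(a [16]float32, k uint16, b, c [16]float32) [16]float32 {
        var dst [16]float32
        for j := 0; j < 16; j++ {
            switch {
            case k&(1<<uint(j)) == 0:
                dst[j] = a[j]
            case j%2 == 0:
                dst[j] = a[j]*b[j] - c[j]
            default:
                dst[j] = a[j]*b[j] + c[j]
            }
        }
        return dst
    }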
func M512MaskFmaddsubRoundPd ¶
func M512MaskFmaddsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_round_pd'. Requires AVX512F.
func M512MaskFmaddsubRoundPs ¶
func M512MaskFmaddsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_round_ps'. Requires AVX512F.
func M512MaskFmsubaddPd ¶
M512MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_pd'. Requires AVX512F.
func M512MaskFmsubaddPs ¶
M512MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_ps'. Requires AVX512F.
func M512MaskFmsubaddRoundPd ¶
func M512MaskFmsubaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_round_pd'. Requires AVX512F.
func M512MaskFmsubaddRoundPs ¶
func M512MaskFmsubaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_round_ps'. Requires AVX512F.
func M512MaskHypotPd ¶
M512MaskHypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_hypot_pd'. Requires AVX512F.
func M512MaskHypotPs ¶
M512MaskHypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_hypot_ps'. Requires AVX512F.
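A masked hypot is one standard-library call per lane in Go; math.Hypot is also careful about intermediate overflow and underflow, which the literal SQRT(a^2 + b^2) formula above glosses over. A minimal illustrative sketch (names and types invented):

    import "math"

    // maskHypotPd models _mm512_mask_hypot_pd on plain arrays.
    func maskHypotPd(src [8]float64, k uint8, a, b [8]float64) [8]float64 {
        var dst [8]float64
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = math.Hypot(a[j], b[j])
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }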
func M512MaskInsertf32x4 ¶
func M512MaskInsertf32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)
M512MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_mask_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskInsertf64x4 ¶
func M512MaskInsertf64x4(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)
M512MaskInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_mask_insertf64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskInserti32x4 ¶
func M512MaskInserti32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)
M512MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_mask_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskInserti64x4 ¶
func M512MaskInserti64x4(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)
M512MaskInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_mask_inserti64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
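Insertion is the mirror image of extraction: copy 'a', overwrite the lane picked by the immediate with 'b', then blend with 'src' under the mask. A pure-Go sketch of the 32x4 case (names and types invented for illustration):

    // maskInsertF32x4 models _mm512_mask_insertf32x4: the 128-bit lane
    // selected by imm8[1:0] is replaced with b before the masked blend.
    func maskInsertF32x4(src [16]float32, k uint16, a [16]float32, b [4]float32, imm byte) [16]float32 {
        tmp := a
        base := int(imm&3) * 4 // start of the selected 128-bit lane
        copy(tmp[base:base+4], b[:])
        var dst [16]float32
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = tmp[j]
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }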
func M512MaskInvsqrtPd ¶
M512MaskInvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := InvSQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_pd'. Requires AVX512F.
func M512MaskInvsqrtPs ¶
M512MaskInvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := InvSQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_ps'. Requires AVX512F.
func M512MaskLog10Pd ¶
M512MaskLog10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := log10(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log10_pd'. Requires AVX512F.
func M512MaskLog10Ps ¶
M512MaskLog10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := log10(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log10_ps'. Requires AVX512F.
func M512MaskLog1pPd ¶
M512MaskLog1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ln(1.0 + a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log1p_pd'. Requires AVX512F.
func M512MaskLog1pPs ¶
M512MaskLog1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ln(1.0 + a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log1p_ps'. Requires AVX512F.
func M512MaskLog2Pd ¶
M512MaskLog2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := log2(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log2_pd'. Requires AVX512F.
func M512MaskLogPd ¶
M512MaskLogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ln(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log_pd'. Requires AVX512F.
func M512MaskLogPs ¶
M512MaskLogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ln(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_log_ps'. Requires AVX512F.
func M512MaskLogbPd ¶
M512MaskLogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_logb_pd'. Requires AVX512F.
func M512MaskLogbPs ¶
M512MaskLogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_logb_ps'. Requires AVX512F.
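Every masked elementwise intrinsic above (the exp, floor, invsqrt, and log families) reduces to the same loop, so a single illustrative helper covers them; only the per-lane function changes. The name below is invented; for the logb semantics, math.Logb in the standard library already returns the binary exponent of its argument.

    // maskMapPd applies f to the active lanes of a and copies src into
    // the inactive ones, mirroring the masked elementwise pseudocode.
    func maskMapPd(src [8]float64, k uint8, a [8]float64, f func(float64) float64) [8]float64 {
        var dst [8]float64
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = f(a[j])
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }

For example, maskMapPd(src, k, a, math.Log1p) models the _mm512_mask_log1p_pd pseudocode, and math.Log, math.Log10, math.Log2, and math.Logb cover the other variants.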
func M512MaskMaxEpi64 ¶
M512MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_mask_max_epi64'. Requires AVX512F.
func M512MaskMaxEpu64 ¶
M512MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_mask_max_epu64'. Requires AVX512F.
func M512MaskMaxPd ¶
M512MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_pd'. Requires AVX512F.
func M512MaskMaxPs ¶
M512MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_ps'. Requires AVX512F.
func M512MaskMaxRoundPd ¶
func M512MaskMaxRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
M512MaskMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_round_pd'. Requires AVX512F.
func M512MaskMaxRoundPs ¶
func M512MaskMaxRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
M512MaskMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_round_ps'. Requires AVX512F.
func M512MaskMinEpi64 ¶
M512MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm512_mask_min_epi64'. Requires AVX512F.
func M512MaskMinEpu64 ¶
M512MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm512_mask_min_epu64'. Requires AVX512F.
func M512MaskMinPd ¶
M512MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_pd'. Requires AVX512F.
func M512MaskMinPs ¶
M512MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_ps'. Requires AVX512F.
func M512MaskMinRoundPd ¶
func M512MaskMinRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
M512MaskMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_round_pd'. Requires AVX512F.
func M512MaskMinRoundPs ¶
func M512MaskMinRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
M512MaskMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_round_ps'. Requires AVX512F.
func M512MaskMovedupPd ¶
M512MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] tmp[319:256] := a[319:256] tmp[383:320] := a[319:256] tmp[447:384] := a[447:384] tmp[511:448] := a[447:384] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_mask_movedup_pd'. Requires AVX512F.
func M512MaskMovehdupPs ¶
M512MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] tmp[287:256] := a[319:288] tmp[319:288] := a[319:288] tmp[351:320] := a[383:352] tmp[383:352] := a[383:352] tmp[415:384] := a[447:416] tmp[447:416] := a[447:416] tmp[479:448] := a[511:480] tmp[511:480] := a[511:480] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_mask_movehdup_ps'. Requires AVX512F.
func M512MaskMoveldupPs ¶
M512MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] tmp[287:256] := a[287:256] tmp[319:288] := a[287:256] tmp[351:320] := a[351:320] tmp[383:352] := a[351:320] tmp[415:384] := a[415:384] tmp[447:416] := a[415:384] tmp[479:448] := a[479:448] tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_mask_moveldup_ps'. Requires AVX512F.
func M512MaskMulEpi32 ¶
M512MaskMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mask_mul_epi32'. Requires AVX512F.
func M512MaskMulEpu32 ¶
M512MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mask_mul_epu32'. Requires AVX512F.
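Both widening multiplies read only the low 32 bits of each 64-bit lane; the epi32/epu32 split is purely about sign treatment. Pure-Go sketch of the signed case (names invented); the unsigned variant is the same loop with uint32/uint64 conversions.

    // maskMulEpi32 models _mm512_mask_mul_epi32: the low 32 bits of each
    // 64-bit lane of a and b are multiplied as signed values, producing a
    // full signed 64-bit product.
    func maskMulEpi32(src [8]int64, k uint8, a, b [8]int64) [8]int64 {
        var dst [8]int64
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                // int32(x) keeps the low 32 bits; widening back to int64
                // sign-extends, as the intrinsic does.
                dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }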
func M512MaskMulloxEpi64 ¶
M512MaskMulloxEpi64: Multiplies elements in packed 64-bit integer vectors 'a' and 'b' together, storing the lower 64 bits of the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_mullox_epi64'. Requires AVX512F.
func M512MaskNearbyintPd ¶
M512MaskNearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := NearbyInt(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_pd'. Requires AVX512F.
func M512MaskNearbyintPs ¶
M512MaskNearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := NearbyInt(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_ps'. Requires AVX512F.
func M512MaskPermutePd ¶
M512MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256] IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320] IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256] IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320] IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384] IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448] IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384] IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskPermutePs ¶
M512MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskPermutevarPd ¶
M512MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0] IF (b[1] == 1) tmp_dst[63:0] := a[127:64] IF (b[65] == 0) tmp_dst[127:64] := a[63:0] IF (b[65] == 1) tmp_dst[127:64] := a[127:64] IF (b[129] == 0) tmp_dst[191:128] := a[191:128] IF (b[129] == 1) tmp_dst[191:128] := a[255:192] IF (b[193] == 0) tmp_dst[255:192] := a[191:128] IF (b[193] == 1) tmp_dst[255:192] := a[255:192] IF (b[257] == 0) tmp_dst[319:256] := a[319:256] IF (b[257] == 1) tmp_dst[319:256] := a[383:320] IF (b[321] == 0) tmp_dst[383:320] := a[319:256] IF (b[321] == 1) tmp_dst[383:320] := a[383:320] IF (b[385] == 0) tmp_dst[447:384] := a[447:384] IF (b[385] == 1) tmp_dst[447:384] := a[511:448] IF (b[449] == 0) tmp_dst[511:448] := a[447:384] IF (b[449] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permutevar_pd'. Requires AVX512F.
func M512MaskPermutevarPs ¶
M512MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permutevar_ps'. Requires AVX512F.
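In both permutevar variants the control element travels with the destination lane while the selection stays inside that lane's 128-bit group. A pure-Go sketch of the single-precision case (names and types invented for illustration):

    // maskPermutevarPs models _mm512_mask_permutevar_ps: bits [1:0] of
    // each control element in b pick one of the four values in the same
    // 128-bit lane of a.
    func maskPermutevarPs(src [16]float32, k uint16, a [16]float32, b [16]uint32) [16]float32 {
        var dst [16]float32
        for j := 0; j < 16; j++ {
            lane := j / 4 * 4    // first element of this 128-bit lane
            sel := int(b[j] & 3) // the SELECT4 control
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[lane+sel]
            } else {
                dst[j] = src[j]
            }
        }
        return dst
    }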
func M512MaskPermutex2varEpi32 ¶
func M512MaskPermutex2varEpi32(a x86.M512i, k x86.Mmask16, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMT2D'. Intrinsic: '_mm512_mask_permutex2var_epi32'. Requires AVX512F.
func M512MaskPermutex2varEpi64 ¶
func M512MaskPermutex2varEpi64(a x86.M512i, k x86.Mmask8, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMT2Q'. Intrinsic: '_mm512_mask_permutex2var_epi64'. Requires AVX512F.
func M512MaskPermutex2varPd ¶
M512MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 IF k[j] dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMT2PD'. Intrinsic: '_mm512_mask_permutex2var_pd'. Requires AVX512F.
func M512MaskPermutex2varPs ¶
M512MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 IF k[j] dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMT2PS'. Intrinsic: '_mm512_mask_permutex2var_ps'. Requires AVX512F.
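permutex2var is a full two-source shuffle: the low index bits address a lane across the whole vector, and the next bit chooses between 'a' and 'b'. Pure-Go sketch of the 32-bit case (names and types invented for illustration):

    // maskPermutex2varPs models _mm512_mask_permutex2var_ps: bits [3:0]
    // of each index pick a lane, bit 4 picks the source vector, and
    // masked-off lanes keep a.
    func maskPermutex2varPs(a [16]float32, k uint16, idx [16]uint32, b [16]float32) [16]float32 {
        var dst [16]float32
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) == 0 {
                dst[j] = a[j]
                continue
            }
            off := int(idx[j] & 0xF)
            if idx[j]&0x10 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return dst
    }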
func M512MaskPermutexEpi64 ¶
M512MaskPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskPermutexPd ¶
M512MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskPermutexvarEpi32 ¶
func M512MaskPermutexvarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)
M512MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm512_mask_permutexvar_epi32'. Requires AVX512F.
func M512MaskPermutexvarEpi64 ¶
func M512MaskPermutexvarEpi64(src x86.M512i, k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)
M512MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutexvar_epi64'. Requires AVX512F.
func M512MaskPermutexvarPd ¶
M512MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 IF k[j] dst[i+63:i] := a[id+63:id] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutexvar_pd'. Requires AVX512F.
func M512MaskPermutexvarPs ¶
M512MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 IF k[j] dst[i+31:i] := a[id+31:id] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm512_mask_permutexvar_ps'. Requires AVX512F.
func M512MaskPowPd ¶
M512MaskPowPd: Compute the exponential value of packed double-precision (64-bit) floating-point elements in 'a' raised by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_pow_pd'. Requires AVX512F.
func M512MaskPowPs ¶
M512MaskPowPs: Compute the exponential value of packed single-precision (32-bit) floating-point elements in 'a' raised by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_pow_ps'. Requires AVX512F.
func M512MaskRcp14Pd ¶
M512MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm512_mask_rcp14_pd'. Requires AVX512F.
func M512MaskRcp14Ps ¶
M512MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm512_mask_rcp14_ps'. Requires AVX512F.
func M512MaskRecipPd ¶
M512MaskRecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (1 / a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_recip_pd'. Requires AVX512F.
func M512MaskRecipPs ¶
M512MaskRecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (1 / a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_recip_ps'. Requires AVX512F.
func M512MaskRemEpi32 ¶
M512MaskRemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_rem_epi32'. Requires AVX512F.
func M512MaskRemEpu32 ¶
M512MaskRemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := 32*j
  IF k[j]
    dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_rem_epu32'. Requires AVX512F.
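A scalar Go model of the masked remainder, assuming REMAINDER means truncated division as in Go's % operator; the names and array layout are illustrative, not part of this package.

package main

import "fmt"

// maskRemEpi32 models _mm512_mask_rem_epi32 one lane at a time:
// selected lanes get the truncated remainder a%b, unselected lanes
// are copied from src. Lanes of b selected by k must be non-zero.
func maskRemEpi32(src, a, b [16]int32, k uint16) (dst [16]int32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = a[j] % b[j]
        } else {
            dst[j] = src[j]
        }
    }
    return
}

func main() {
    var src, a, b [16]int32
    a[0], b[0] = -7, 3
    a[1], b[1] = 7, 3
    r := maskRemEpi32(src, a, b, 0b11)
    fmt.Println(r[:2]) // [-1 1]: remainder truncates toward zero
}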
func M512MaskRintPd ¶
M512MaskRintPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RoundToNearestEven(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_rint_pd'. Requires AVX512F.
func M512MaskRintPs ¶
M512MaskRintPs: Round the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RoundToNearestEven(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_rint_ps'. Requires AVX512F.
func M512MaskRolEpi32 ¶
M512MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm512_mask_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRolEpi64 ¶
M512MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm512_mask_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRolvEpi32 ¶
M512MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm512_mask_rolv_epi32'. Requires AVX512F.
func M512MaskRolvEpi64 ¶
M512MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm512_mask_rolv_epi64'. Requires AVX512F.
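A minimal scalar sketch of the masked variable-rotate semantics, using math/bits for the per-lane rotate; maskRolvEpi32 is an illustrative name, not part of this package.

package main

import (
    "fmt"
    "math/bits"
)

// maskRolvEpi32 is a scalar model of VPROLVD: each 32-bit lane of a
// is rotated left by the low 5 bits of the corresponding lane of b;
// unselected lanes are copied from src.
func maskRolvEpi32(src, a, b [16]uint32, k uint16) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = bits.RotateLeft32(a[j], int(b[j]%32))
        } else {
            dst[j] = src[j]
        }
    }
    return
}

func main() {
    var src, a, b [16]uint32
    a[0], b[0] = 0x80000001, 4
    fmt.Printf("%#x\n", maskRolvEpi32(src, a, b, 0x0001)[0]) // 0x18
}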
func M512MaskRorEpi32 ¶
M512MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm512_mask_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRorEpi64 ¶
M512MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm512_mask_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRorvEpi32 ¶
M512MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm512_mask_rorv_epi32'. Requires AVX512F.
func M512MaskRorvEpi64 ¶
M512MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm512_mask_rorv_epi64'. Requires AVX512F.
func M512MaskRoundscalePd ¶
M512MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
  ESAC
  dst[63:0] := 2^-M * tmp[63:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[63:0] != dst[63:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRoundscalePs ¶
M512MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
  ESAC
  dst[31:0] := 2^-M * tmp[31:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[31:0] != dst[31:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
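For intuition, here is a one-element Go sketch of the RoundTo_Integer helper, restricted to rounding direction 0 (round to nearest even) and without the #PE bookkeeping; roundscale is an illustrative name, not part of this package.

package main

import (
    "fmt"
    "math"
)

// roundscale rounds x to m fraction bits (m = imm8[7:4]), i.e. to the
// nearest multiple of 2^-m, using round-to-nearest-even.
func roundscale(x float64, m uint) float64 {
    scaled := math.Ldexp(x, int(m)) // 2^m * x
    return math.Ldexp(math.RoundToEven(scaled), -int(m)) // scale back down
}

func main() {
    fmt.Println(roundscale(1.2345, 4)) // 1.25, the nearest multiple of 1/16
}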
func M512MaskRoundscaleRoundPd ¶
func M512MaskRoundscaleRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)
M512MaskRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
  ESAC
  dst[63:0] := 2^-M * tmp[63:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[63:0] != dst[63:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRoundscaleRoundPs ¶
func M512MaskRoundscaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)
M512MaskRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
  ESAC
  dst[31:0] := 2^-M * tmp[31:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[31:0] != dst[31:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskRsqrt14Pd ¶
M512MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_mask_rsqrt14_pd'. Requires AVX512F.
func M512MaskRsqrt14Ps ¶
M512MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_mask_rsqrt14_ps'. Requires AVX512F.
func M512MaskScalefPd ¶
M512MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_pd'. Requires AVX512F.
func M512MaskScalefPs ¶
M512MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_ps'. Requires AVX512F.
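Ignoring the NaN and denormal special cases spelled out above, the core of SCALE is just a * 2^FLOOR(b). A one-element Go sketch, under that simplifying assumption (scalef is an illustrative name):

package main

import (
    "fmt"
    "math"
)

// scalef models SCALE for finite, normal inputs: scale a by two to
// the power of b rounded down to an integer.
func scalef(a, b float64) float64 {
    return a * math.Exp2(math.Floor(b))
}

func main() {
    fmt.Println(scalef(3, 2.7)) // 3 * 2^2 = 12
}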
func M512MaskScalefRoundPd ¶
func M512MaskScalefRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
M512MaskScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_round_pd'. Requires AVX512F.
func M512MaskScalefRoundPs ¶
func M512MaskScalefRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
M512MaskScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_round_ps'. Requires AVX512F.
func M512MaskSet1Epi32 ¶
M512MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := a[31:0]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_set1_epi32'. Requires AVX512F.
func M512MaskSet1Epi64 ¶
M512MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[63:0]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_set1_epi64'. Requires AVX512F.
func M512MaskShuffleF32x4 ¶
func M512MaskShuffleF32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
M512MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_mask_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskShuffleF64x2 ¶
func M512MaskShuffleF64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
M512MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_mask_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskShuffleI32x4 ¶
func M512MaskShuffleI32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_mask_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskShuffleI64x2 ¶
func M512MaskShuffleI64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_mask_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
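All four Shuffle*x* variants share the same SELECT4 lane picker. A scalar Go model of the single-precision case, without the writemask (shuffleF32x4 is an illustrative name, not part of this package):

package main

import "fmt"

// shuffleF32x4 models VSHUFF32X4 without the writemask: each pair of
// imm8 bits names the 128-bit lane (group of four floats) of a or b
// that lands in the corresponding lane of dst.
func shuffleF32x4(a, b [16]float32, imm8 uint8) (dst [16]float32) {
    select4 := func(src *[16]float32, ctrl uint8) []float32 {
        off := int(ctrl&3) * 4
        return src[off : off+4]
    }
    copy(dst[0:4], select4(&a, imm8))
    copy(dst[4:8], select4(&a, imm8>>2))
    copy(dst[8:12], select4(&b, imm8>>4))
    copy(dst[12:16], select4(&b, imm8>>6))
    return
}

func main() {
    var a, b [16]float32
    for j := range a {
        a[j], b[j] = float32(j), float32(100+j)
    }
    // 0x4E = 0b01001110: dst = a's lanes 2 and 3, then b's lanes 0 and 1.
    fmt.Println(shuffleF32x4(a, b, 0x4E))
}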
func M512MaskShufflePd ¶
func M512MaskShufflePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)
M512MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm512_mask_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskShufflePs ¶
func M512MaskShufflePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)
M512MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
  ESAC
  RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm512_mask_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskSinPd ¶
M512MaskSinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SIN(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sin_pd'. Requires AVX512F.
func M512MaskSinPs ¶
M512MaskSinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SIN(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sin_ps'. Requires AVX512F.
func M512MaskSincosPd ¶
func M512MaskSincosPd(cos_res *x86.M512d, sin_src x86.M512d, cos_src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
M512MaskSincosPd: Compute the sine and cosine of the packed double-precision (64-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SIN(a[i+63:i])
    cos_res[i+63:i] := COS(a[i+63:i])
  ELSE
    dst[i+63:i] := sin_src[i+63:i]
    cos_res[i+63:i] := cos_src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sincos_pd'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func M512MaskSincosPs ¶
func M512MaskSincosPs(cos_res *x86.M512, sin_src x86.M512, cos_src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
M512MaskSincosPs: Compute the sine and cosine of the packed single-precision (32-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SIN(a[i+31:i])
    cos_res[i+31:i] := COS(a[i+31:i])
  ELSE
    dst[i+31:i] := sin_src[i+31:i]
    cos_res[i+31:i] := cos_src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sincos_ps'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
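The pointer out-parameter maps naturally onto Go's multiple return values. A scalar sketch of the masked sine/cosine pair (illustrative names, not part of this package):

package main

import (
    "fmt"
    "math"
)

// maskSincosPd models _mm512_mask_sincos_pd with two results instead
// of a pointer parameter: selected lanes get sin/cos of a, unselected
// lanes fall back to sinSrc and cosSrc respectively.
func maskSincosPd(sinSrc, cosSrc, a [8]float64, k uint8) (sin, cos [8]float64) {
    for j := 0; j < 8; j++ {
        if k&(1<<uint(j)) != 0 {
            sin[j] = math.Sin(a[j])
            cos[j] = math.Cos(a[j])
        } else {
            sin[j], cos[j] = sinSrc[j], cosSrc[j]
        }
    }
    return
}

func main() {
    var sinSrc, cosSrc, a [8]float64
    a[0] = math.Pi / 2
    s, c := maskSincosPd(sinSrc, cosSrc, a, 0x01)
    fmt.Println(s[0], c[0]) // 1 and ~6.1e-17
}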
func M512MaskSindPd ¶
M512MaskSindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SIND(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sind_pd'. Requires AVX512F.
func M512MaskSindPs ¶
M512MaskSindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SIND(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sind_ps'. Requires AVX512F.
func M512MaskSinhPd ¶
M512MaskSinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SINH(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sinh_pd'. Requires AVX512F.
func M512MaskSinhPs ¶
M512MaskSinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SINH(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_sinh_ps'. Requires AVX512F.
func M512MaskSllEpi32 ¶
M512MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
    FI
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm512_mask_sll_epi32'. Requires AVX512F.
func M512MaskSllEpi64 ¶
M512MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_sll_epi64'. Requires AVX512F.
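Note that the whole low quadword of 'count' is consulted, so any count above the element width clears the selected lanes rather than being reduced modulo 32 or 64. A scalar Go model of the 32-bit case (illustrative names):

package main

import "fmt"

// maskSllEpi32 models VPSLLD with a writemask: one shared 64-bit
// shift count; counts above 31 produce 0, unselected lanes keep src.
func maskSllEpi32(src, a [16]uint32, count uint64, k uint16) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        switch {
        case k&(1<<uint(j)) == 0:
            dst[j] = src[j]
        case count > 31:
            dst[j] = 0
        default:
            dst[j] = a[j] << count
        }
    }
    return
}

func main() {
    var src, a [16]uint32
    a[0] = 1
    fmt.Println(maskSllEpi32(src, a, 33, 1)[0]) // 0, not 2: the count is not taken mod 32
}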
func M512MaskSlliEpi64 ¶
M512MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskSllvEpi64 ¶
M512MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_mask_sllv_epi64'. Requires AVX512F.
func M512MaskSqrtPd ¶
M512MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SQRT(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_pd'. Requires AVX512F.
func M512MaskSqrtPs ¶
M512MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SQRT(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_ps'. Requires AVX512F.
func M512MaskSqrtRoundPd ¶
M512MaskSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SQRT(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_round_pd'. Requires AVX512F.
func M512MaskSqrtRoundPs ¶
M512MaskSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SQRT(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_round_ps'. Requires AVX512F.
func M512MaskSraEpi32 ¶
M512MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := SignBit
    ELSE
      dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
    FI
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm512_mask_sra_epi32'. Requires AVX512F.
func M512MaskSraEpi64 ¶
M512MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := SignBit
    ELSE
      dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_sra_epi64'. Requires AVX512F.
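The arithmetic shifts differ from the logical ones only in what floods in for large counts: the sign bit instead of zeros. In Go, shifting an int64 right by 63 yields exactly that all-sign-bits value, so a one-lane sketch (sraLane is an illustrative name):

package main

import "fmt"

// sraLane models one lane of VPSRAQ: arithmetic right shift whose
// count saturates at 63, leaving only the replicated sign bit.
func sraLane(a int64, count uint64) int64 {
    if count > 63 {
        count = 63
    }
    return a >> count
}

func main() {
    fmt.Println(sraLane(-8, 2), sraLane(-8, 100)) // -2 -1
}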
func M512MaskSraiEpi64 ¶
M512MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := SignBit
    ELSE
      dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskSravEpi64 ¶
M512MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_mask_srav_epi64'. Requires AVX512F.
func M512MaskSrlEpi32 ¶
M512MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
    FI
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm512_mask_srl_epi32'. Requires AVX512F.
func M512MaskSrlEpi64 ¶
M512MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srl_epi64'. Requires AVX512F.
func M512MaskSrliEpi64 ¶
M512MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskSrlvEpi64 ¶
M512MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_mask_srlv_epi64'. Requires AVX512F.
func M512MaskSubEpi64 ¶
M512MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[i+63:i] - b[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm512_mask_sub_epi64'. Requires AVX512F.
func M512MaskSvmlRoundPd ¶
M512MaskSvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := ROUND(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_svml_round_pd'. Requires AVX512F.
func M512MaskTanPd ¶
M512MaskTanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := TAN(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tan_pd'. Requires AVX512F.
func M512MaskTanPs ¶
M512MaskTanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := TAN(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tan_ps'. Requires AVX512F.
func M512MaskTandPd ¶
M512MaskTandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := TAND(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tand_pd'. Requires AVX512F.
func M512MaskTandPs ¶
M512MaskTandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := TAND(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tand_ps'. Requires AVX512F.
func M512MaskTanhPd ¶
M512MaskTanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := TANH(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tanh_pd'. Requires AVX512F.
func M512MaskTanhPs ¶
M512MaskTanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := TANH(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_tanh_ps'. Requires AVX512F.
func M512MaskTernarylogicEpi32 ¶
func M512MaskTernarylogicEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskTernarylogicEpi32: Bitwise ternary logic that can implement any three-operand binary function; the specific function is selected by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst', using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    FOR h := 0 to 31
      index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
      dst[i+h] := imm8[index[2:0]]
    ENDFOR
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_mask_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskTernarylogicEpi64 ¶
func M512MaskTernarylogicEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskTernarylogicEpi64: Bitwise ternary logic that can implement any three-operand binary function; the specific function is selected by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst', using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    FOR h := 0 to 63
      index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
      dst[i+h] := imm8[index[2:0]]
    ENDFOR
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_mask_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
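A scalar Go model of one 32-bit lane makes the imm8 truth-table lookup concrete; ternarylogic32 is an illustrative name, not part of this package.

package main

import "fmt"

// ternarylogic32 models VPTERNLOGD for one lane: the bits of src, a
// and b at each position form a 3-bit index, and imm8's bit at that
// index becomes the result bit, so imm8 is an arbitrary 3-input
// truth table.
func ternarylogic32(src, a, b uint32, imm8 uint8) uint32 {
    var dst uint32
    for h := uint(0); h < 32; h++ {
        idx := (src>>h&1)<<2 | (a>>h&1)<<1 | b>>h&1
        dst |= uint32(imm8>>idx&1) << h
    }
    return dst
}

func main() {
    // 0xE8 sets table bits 3, 5, 6 and 7: the 3-input majority function.
    fmt.Printf("%#x\n", ternarylogic32(0xF0F0F0F0, 0xCCCCCCCC, 0xAAAAAAAA, 0xE8)) // 0xe8e8e8e8
}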
func M512MaskTestEpi64Mask ¶
M512MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 7
  i := j*64
  IF k1[j]
    k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
  ELSE
    k[j] := 0
  FI
ENDFOR
k[MAX:8] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_mask_test_epi64_mask'. Requires AVX512F.
func M512MaskTestnEpi32Mask ¶
M512MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 15
  i := j*32
  IF k1[j]
    k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
  ELSE
    k[j] := 0
  FI
ENDFOR
k[MAX:16] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_mask_testn_epi32_mask'. Requires AVX512F.
func M512MaskTestnEpi64Mask ¶
M512MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 7
  i := j*64
  IF k1[j]
    k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
  ELSE
    k[j] := 0
  FI
ENDFOR
k[MAX:8] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_mask_testn_epi64_mask'. Requires AVX512F.
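In Go terms, the test/testn pair derives a fresh mask from a lane-wise AND, pre-filtered by 'k1'. A sketch of the test variant (illustrative names):

package main

import "fmt"

// maskTestEpi64Mask models VPTESTMQ: bit j of the result is set when
// lane j passes the k1 filter and a[j]&b[j] is non-zero. The testn
// (VPTESTNMQ) variant simply inverts the != 0 condition.
func maskTestEpi64Mask(k1 uint8, a, b [8]uint64) (k uint8) {
    for j := uint(0); j < 8; j++ {
        if k1&(1<<j) != 0 && a[j]&b[j] != 0 {
            k |= 1 << j
        }
    }
    return
}

func main() {
    a := [8]uint64{1, 2, 4, 8, 0, 0, 0, 0}
    b := [8]uint64{1, 1, 4, 8, 1, 1, 1, 1}
    fmt.Printf("%#08b\n", maskTestEpi64Mask(0xFF, a, b)) // 0b00001101
}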
func M512MaskTruncPd ¶
M512MaskTruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := TRUNCATE(a[i+63:i])
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_trunc_pd'. Requires AVX512F.
func M512MaskTruncPs ¶
M512MaskTruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := TRUNCATE(a[i+31:i])
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mask_trunc_ps'. Requires AVX512F.
func M512MaskUnpackhiEpi32 ¶
M512MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
  dst[31:0] := src1[95:64]
  dst[63:32] := src2[95:64]
  dst[95:64] := src1[127:96]
  dst[127:96] := src2[127:96]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_mask_unpackhi_epi32'. Requires AVX512F.
func M512MaskUnpackhiEpi64 ¶
M512MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[127:64]
  dst[127:64] := src2[127:64]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_mask_unpackhi_epi64'. Requires AVX512F.
func M512MaskUnpackhiPd ¶
M512MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[127:64]
  dst[127:64] := src2[127:64]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_mask_unpackhi_pd'. Requires AVX512F.
func M512MaskUnpackhiPs ¶
M512MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
  dst[31:0] := src1[95:64]
  dst[63:32] := src2[95:64]
  dst[95:64] := src1[127:96]
  dst[127:96] := src2[127:96]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_mask_unpackhi_ps'. Requires AVX512F.
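Each 128-bit lane is handled independently, so a Go model of the per-lane helper is enough to capture the pattern (illustrative name):

package main

import "fmt"

// interleaveHighDwords models INTERLEAVE_HIGH_DWORDS on one 128-bit
// lane: the two high dwords of each source, alternating src1, src2.
func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
    return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
}

func main() {
    fmt.Println(interleaveHighDwords(
        [4]uint32{0, 1, 2, 3},
        [4]uint32{10, 11, 12, 13},
    )) // [2 12 3 13]
}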
func M512MaskUnpackloEpi32 ¶
M512MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
  dst[31:0] := src1[31:0]
  dst[63:32] := src2[31:0]
  dst[95:64] := src1[63:32]
  dst[127:96] := src2[63:32]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_mask_unpacklo_epi32'. Requires AVX512F.
func M512MaskUnpackloEpi64 ¶
M512MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[63:0]
  dst[127:64] := src2[63:0]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_mask_unpacklo_epi64'. Requires AVX512F.
func M512MaskUnpackloPd ¶
M512MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[63:0]
  dst[127:64] := src2[63:0]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := src[i+63:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_mask_unpacklo_pd'. Requires AVX512F.
func M512MaskUnpackloPs ¶
M512MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
  dst[31:0] := src1[31:0]
  dst[63:32] := src2[31:0]
  dst[95:64] := src1[63:32]
  dst[127:96] := src2[63:32]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := src[i+31:i]
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_mask_unpacklo_ps'. Requires AVX512F.
func M512MaskzAbsEpi32 ¶
M512MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm512_maskz_abs_epi32'. Requires AVX512F.
func M512MaskzAbsEpi64 ¶
M512MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm512_maskz_abs_epi64'. Requires AVX512F.
func M512MaskzAddEpi32 ¶
M512MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDD'. Intrinsic: '_mm512_maskz_add_epi32'. Requires AVX512F.
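The maskz variants differ from the mask variants only in the fallback branch: inactive lanes become zero instead of being copied from 'src'. A scalar Go sketch of the operation above (array and mask types are stand-ins, not the package's types):

    // maskzAddEpi32 models _mm512_maskz_add_epi32: lanes whose mask bit
    // is clear are zeroed rather than copied from a source vector.
    func maskzAddEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j] + b[j] // wraparound add, as in the intrinsic
            }
            // else: dst[j] stays 0 (zeromask)
        }
        return
    }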
func M512MaskzAddEpi64 ¶
M512MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm512_maskz_add_epi64'. Requires AVX512F.
func M512MaskzAddPd ¶
M512MaskzAddPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_pd'. Requires AVX512F.
func M512MaskzAddPs ¶
M512MaskzAddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_ps'. Requires AVX512F.
func M512MaskzAddRoundPd ¶
M512MaskzAddRoundPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_round_pd'. Requires AVX512F.
func M512MaskzAddRoundPs ¶
M512MaskzAddRoundPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_round_ps'. Requires AVX512F.
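The 'rounding' argument in all of the *Round* functions is a bitwise OR of one rounding-direction constant and an optional exception-suppression flag. The Go constants below mirror the customary values from Intel's C headers (the numeric values are an assumption here; whether this package re-exports them is not shown):

    const (
        MMFroundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
        MMFroundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
        MMFroundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
        MMFroundToZero       = 0x03 // _MM_FROUND_TO_ZERO
        MMFroundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION (use MXCSR.RC)
        MMFroundNoExc        = 0x08 // _MM_FROUND_NO_EXC (suppress exceptions)
    )

    // Truncate and suppress exceptions, as in the fourth row above:
    var rounding = MMFroundToZero | MMFroundNoExc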
func M512MaskzAlignrEpi32 ¶
M512MaskzAlignrEpi32: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 32-bit elements, and store the low 64 bytes (16 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (32*count) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := temp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VALIGND'. Intrinsic: '_mm512_maskz_alignr_epi32'. Requires AVX512F.
func M512MaskzAlignrEpi64 ¶
M512MaskzAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
temp[1023:512] := a[511:0] temp[511:0] := b[511:0] temp[1023:0] := temp[1023:0] >> (64*count) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := temp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VALIGNQ'. Intrinsic: '_mm512_maskz_alignr_epi64'. Requires AVX512F.
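Both VALIGND and VALIGNQ are a concatenate-then-shift: 'a' forms the high half of a 1024-bit temporary and 'b' the low half. A scalar Go model of the epi32 form above (shift counts of 32 or more drain the temporary to zero):

    // maskzAlignrEpi32 models _mm512_maskz_alignr_epi32: temp = a:b with
    // a in the high half, shifted right by count 32-bit elements.
    func maskzAlignrEpi32(k uint16, a, b [16]int32, count int) (dst [16]int32) {
        temp := make([]int32, 0, 32)
        temp = append(temp, b[:]...) // low 512 bits
        temp = append(temp, a[:]...) // high 512 bits
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 && j+count < 32 {
                dst[j] = temp[j+count] // element j of the shifted result
            }
            // lanes shifted past the temporary, or masked off, stay 0
        }
        return
    }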
func M512MaskzAndEpi32 ¶
M512MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPANDD'. Intrinsic: '_mm512_maskz_and_epi32'. Requires AVX512F.
func M512MaskzAndEpi64 ¶
M512MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPANDQ'. Intrinsic: '_mm512_maskz_and_epi64'. Requires AVX512F.
func M512MaskzAndnotEpi32 ¶
M512MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPANDND'. Intrinsic: '_mm512_maskz_andnot_epi32'. Requires AVX512F.
func M512MaskzAndnotEpi64 ¶
M512MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPANDNQ'. Intrinsic: '_mm512_maskz_andnot_epi64'. Requires AVX512F.
func M512MaskzBroadcastF32x4 ¶
M512MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_maskz_broadcast_f32x4'. Requires AVX512F.
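A scalar Go model of the 128-bit-lane broadcast above; the source's four elements simply repeat with period four across the sixteen destination lanes (representations are stand-ins):

    // maskzBroadcastF32x4 models _mm512_maskz_broadcast_f32x4: the four
    // source floats repeat across all four 128-bit lanes.
    func maskzBroadcastF32x4(k uint16, a [4]float32) (dst [16]float32) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j%4] // n := (j mod 4)*32 in the pseudocode
            }
        }
        return
    }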
func M512MaskzBroadcastF64x4 ¶
M512MaskzBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_maskz_broadcast_f64x4'. Requires AVX512F.
func M512MaskzBroadcastI32x4 ¶
M512MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 n := (j mod 4)*32 IF k[j] dst[i+31:i] := a[n+31:n] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_maskz_broadcast_i32x4'. Requires AVX512F.
func M512MaskzBroadcastI64x4 ¶
M512MaskzBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 n := (j mod 4)*64 IF k[j] dst[i+63:i] := a[n+63:n] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_maskz_broadcast_i64x4'. Requires AVX512F.
func M512MaskzBroadcastdEpi32 ¶
M512MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_broadcastd_epi32'. Requires AVX512F.
func M512MaskzBroadcastqEpi64 ¶
M512MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_broadcastq_epi64'. Requires AVX512F.
func M512MaskzBroadcastsdPd ¶
M512MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_maskz_broadcastsd_pd'. Requires AVX512F.
func M512MaskzBroadcastssPs ¶
M512MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_maskz_broadcastss_ps'. Requires AVX512F.
func M512MaskzCompressEpi32 ¶
M512MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_maskz_compress_epi32'. Requires AVX512F.
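Compression packs the selected elements toward element 0 and zeroes the tail, which is what the running index 'm' in the pseudocode expresses. A scalar Go sketch:

    // maskzCompressEpi32 models _mm512_maskz_compress_epi32: active
    // elements are stored contiguously from element 0; the rest is zero.
    func maskzCompressEpi32(k uint16, a [16]int32) (dst [16]int32) {
        m := 0
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[m] = a[j]
                m++
            }
        }
        return // dst[m:] remains zero, matching dst[511:m] := 0
    }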
func M512MaskzCompressEpi64 ¶
M512MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_maskz_compress_epi64'. Requires AVX512F.
func M512MaskzCompressPd ¶
M512MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_maskz_compress_pd'. Requires AVX512F.
func M512MaskzCompressPs ¶
M512MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[511:m] := 0 dst[MAX:512] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_maskz_compress_ps'. Requires AVX512F.
func M512MaskzCvtRoundepi32Ps ¶
M512MaskzCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepi32_ps'. Requires AVX512F.
func M512MaskzCvtRoundepu32Ps ¶
M512MaskzCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepu32_ps'. Requires AVX512F.
func M512MaskzCvtRoundpdEpi32 ¶
M512MaskzCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epi32'. Requires AVX512F.
func M512MaskzCvtRoundpdEpu32 ¶
M512MaskzCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epu32'. Requires AVX512F.
func M512MaskzCvtRoundpdPs ¶
M512MaskzCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvt_roundpd_ps'. Requires AVX512F.
func M512MaskzCvtRoundphPs ¶
M512MaskzCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvt_roundph_ps'. Requires AVX512F.
func M512MaskzCvtRoundpsEpi32 ¶
M512MaskzCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epi32'. Requires AVX512F.
func M512MaskzCvtRoundpsEpu32 ¶
M512MaskzCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epu32'. Requires AVX512F.
func M512MaskzCvtRoundpsPd ¶
M512MaskzCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvt_roundps_pd'. Requires AVX512F.
func M512MaskzCvtRoundpsPh ¶
M512MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvt_roundps_ph'. Requires AVX512F.
func M512MaskzCvtepi16Epi32 ¶
M512MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_maskz_cvtepi16_epi32'. Requires AVX512F.
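SignExtend in the pseudocode is ordinary signed widening; in a scalar Go model it is just a type conversion, which replicates the sign bit into the new upper bits:

    // maskzCvtepi16Epi32 models _mm512_maskz_cvtepi16_epi32; int32(x)
    // on an int16 copies the sign bit into bits 31:16.
    func maskzCvtepi16Epi32(k uint16, a [16]int16) (dst [16]int32) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = int32(a[j]) // sign extension
            }
        }
        return
    }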
func M512MaskzCvtepi16Epi64 ¶
M512MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_maskz_cvtepi16_epi64'. Requires AVX512F.
func M512MaskzCvtepi32Epi16 ¶
M512MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm512_maskz_cvtepi32_epi16'. Requires AVX512F.
func M512MaskzCvtepi32Epi64 ¶
M512MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_maskz_cvtepi32_epi64'. Requires AVX512F.
func M512MaskzCvtepi32Epi8 ¶
M512MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm512_maskz_cvtepi32_epi8'. Requires AVX512F.
func M512MaskzCvtepi32Pd ¶
M512MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_maskz_cvtepi32_pd'. Requires AVX512F.
func M512MaskzCvtepi32Ps ¶
M512MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvtepi32_ps'. Requires AVX512F.
func M512MaskzCvtepi64Epi16 ¶
M512MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm512_maskz_cvtepi64_epi16'. Requires AVX512F.
func M512MaskzCvtepi64Epi32 ¶
M512MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm512_maskz_cvtepi64_epi32'. Requires AVX512F.
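The truncating epi64-to-epi32 conversion keeps only the low 32 bits of each element, which in Go is a plain narrowing conversion (scalar sketch):

    // maskzCvtepi64Epi32 models _mm512_maskz_cvtepi64_epi32; int32(x)
    // on an int64 discards bits 63:32, matching Truncate_Int64_To_Int32.
    func maskzCvtepi64Epi32(k uint8, a [8]int64) (dst [8]int32) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = int32(a[j]) // keep the low 32 bits only
            }
        }
        return
    }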
func M512MaskzCvtepi64Epi8 ¶
M512MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm512_maskz_cvtepi64_epi8'. Requires AVX512F.
func M512MaskzCvtepi8Epi32 ¶
M512MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_maskz_cvtepi8_epi32'. Requires AVX512F.
func M512MaskzCvtepi8Epi64 ¶
M512MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_maskz_cvtepi8_epi64'. Requires AVX512F.
func M512MaskzCvtepu16Epi32 ¶
M512MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_maskz_cvtepu16_epi32'. Requires AVX512F.
func M512MaskzCvtepu16Epi64 ¶
M512MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_maskz_cvtepu16_epi64'. Requires AVX512F.
func M512MaskzCvtepu32Epi64 ¶
M512MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_maskz_cvtepu32_epi64'. Requires AVX512F.
func M512MaskzCvtepu32Pd ¶
M512MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_maskz_cvtepu32_pd'. Requires AVX512F.
func M512MaskzCvtepu32Ps ¶
M512MaskzCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvtepu32_ps'. Requires AVX512F.
func M512MaskzCvtepu8Epi32 ¶
M512MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_maskz_cvtepu8_epi32'. Requires AVX512F.
func M512MaskzCvtepu8Epi64 ¶
M512MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_maskz_cvtepu8_epi64'. Requires AVX512F.
func M512MaskzCvtpdEpi32 ¶
M512MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvtpd_epi32'. Requires AVX512F.
func M512MaskzCvtpdEpu32 ¶
M512MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtpd_epu32'. Requires AVX512F.
func M512MaskzCvtpdPs ¶
M512MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvtpd_ps'. Requires AVX512F.
func M512MaskzCvtphPs ¶
M512MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvtph_ps'. Requires AVX512F.
func M512MaskzCvtpsEpi32 ¶
M512MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvtps_epi32'. Requires AVX512F.
func M512MaskzCvtpsEpu32 ¶
M512MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtps_epu32'. Requires AVX512F.
func M512MaskzCvtpsPd ¶
M512MaskzCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvtps_pd'. Requires AVX512F.
func M512MaskzCvtpsPh ¶
M512MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvtps_ph'. Requires AVX512F.
func M512MaskzCvtsepi32Epi16 ¶
M512MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_maskz_cvtsepi32_epi16'. Requires AVX512F.
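Saturate_Int32_To_Int16 clamps to the representable range instead of discarding high bits. A scalar Go helper showing the difference from plain truncation:

    // saturateInt32ToInt16 clamps to [-32768, 32767], matching
    // Saturate_Int32_To_Int16; a bare int16(x) would instead wrap.
    func saturateInt32ToInt16(x int32) int16 {
        if x > 32767 {
            return 32767
        }
        if x < -32768 {
            return -32768
        }
        return int16(x)
    }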
func M512MaskzCvtsepi32Epi8 ¶
M512MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_maskz_cvtsepi32_epi8'. Requires AVX512F.
func M512MaskzCvtsepi64Epi16 ¶
M512MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_maskz_cvtsepi64_epi16'. Requires AVX512F.
func M512MaskzCvtsepi64Epi32 ¶
M512MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_maskz_cvtsepi64_epi32'. Requires AVX512F.
func M512MaskzCvtsepi64Epi8 ¶
M512MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_maskz_cvtsepi64_epi8'. Requires AVX512F.
func M512MaskzCvttRoundpdEpi32 ¶
M512MaskzCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epi32'. Requires AVX512F.
func M512MaskzCvttRoundpdEpu32 ¶
M512MaskzCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epu32'. Requires AVX512F.
func M512MaskzCvttRoundpsEpi32 ¶
M512MaskzCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epi32'. Requires AVX512F.
func M512MaskzCvttRoundpsEpu32 ¶
M512MaskzCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epu32'. Requires AVX512F.
func M512MaskzCvttpdEpi32 ¶
M512MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvttpd_epi32'. Requires AVX512F.
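Go's float-to-integer conversion also truncates toward zero, so a scalar model of the operation above is a direct cast; note that the hardware returns an indefinite value for out-of-range inputs, which this sketch does not reproduce:

    // maskzCvttpdEpi32 models _mm512_maskz_cvttpd_epi32; int32(f)
    // truncates toward zero, the same rounding the VCVTT* forms use.
    func maskzCvttpdEpi32(k uint8, a [8]float64) (dst [8]int32) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = int32(a[j]) // Convert_FP64_To_Int32_Truncate
            }
        }
        return
    }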
func M512MaskzCvttpdEpu32 ¶
M512MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvttpd_epu32'. Requires AVX512F.
func M512MaskzCvttpsEpi32 ¶
M512MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvttps_epi32'. Requires AVX512F.
func M512MaskzCvttpsEpu32 ¶
M512MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvttps_epu32'. Requires AVX512F.
func M512MaskzCvtusepi32Epi16 ¶
M512MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_maskz_cvtusepi32_epi16'. Requires AVX512F.
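The unsigned saturation used here clamps at the destination's maximum (0xFFFF for 16 bits) and never wraps. A scalar Go helper:

    // saturateUint32ToUint16 clamps at 0xFFFF, matching
    // Saturate_UnsignedInt32_To_Int16 in the pseudocode above.
    func saturateUint32ToUint16(x uint32) uint16 {
        if x > 0xFFFF {
            return 0xFFFF
        }
        return uint16(x)
    }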
func M512MaskzCvtusepi32Epi8 ¶
M512MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_maskz_cvtusepi32_epi8'. Requires AVX512F.
func M512MaskzCvtusepi64Epi16 ¶
M512MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_maskz_cvtusepi64_epi16'. Requires AVX512F.
func M512MaskzCvtusepi64Epi32 ¶
M512MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_maskz_cvtusepi64_epi32'. Requires AVX512F.
func M512MaskzCvtusepi64Epi8 ¶
M512MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_maskz_cvtusepi64_epi8'. Requires AVX512F.
func M512MaskzDivPd ¶
M512MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_pd'. Requires AVX512F.
func M512MaskzDivPs ¶
M512MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_ps'. Requires AVX512F.
func M512MaskzDivRoundPd ¶
M512MaskzDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 7 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_round_pd'. Requires AVX512F.
func M512MaskzDivRoundPs ¶
M512MaskzDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE FOR j := 0 to 15 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_round_ps'. Requires AVX512F.
func M512MaskzExpandEpi32 ¶
M512MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_maskz_expand_epi32'. Requires AVX512F.
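Expansion is the inverse of compression: consecutive source elements are dealt out to the active lanes in mask order, and inactive lanes are zeroed. A scalar Go sketch:

    // maskzExpandEpi32 models _mm512_maskz_expand_epi32: source element m
    // lands in the m-th active destination lane.
    func maskzExpandEpi32(k uint16, a [16]int32) (dst [16]int32) {
        m := 0
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[m]
                m++
            }
            // inactive lanes stay zero
        }
        return
    }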
func M512MaskzExpandEpi64 ¶
M512MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_maskz_expand_epi64'. Requires AVX512F.
func M512MaskzExpandPd ¶
M512MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_maskz_expand_pd'. Requires AVX512F.
func M512MaskzExpandPs ¶
M512MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_maskz_expand_ps'. Requires AVX512F.
func M512MaskzExtractf32x4Ps ¶
M512MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_maskz_extractf32x4_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
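The immediate simply selects one of the four 128-bit lanes before the 4-bit mask applies. A scalar Go model of the extract above (only imm8[1:0] participate in the lane select):

    // maskzExtractf32x4Ps models _mm512_maskz_extractf32x4_ps: pick the
    // 128-bit lane named by imm8, then apply the low 4 mask bits.
    func maskzExtractf32x4Ps(k uint8, a [16]float32, imm8 byte) (dst [4]float32) {
        base := int(imm8&3) * 4 // lane select
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[base+j]
            }
        }
        return
    }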
func M512MaskzExtractf64x4Pd ¶
M512MaskzExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_maskz_extractf64x4_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzExtracti32x4Epi32 ¶
M512MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[127:0] := a[127:0] 1: tmp[127:0] := a[255:128] 2: tmp[127:0] := a[383:256] 3: tmp[127:0] := a[511:384] ESAC FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_maskz_extracti32x4_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzExtracti64x4Epi64 ¶
M512MaskzExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
CASE imm8[7:0] of 0: tmp[255:0] := a[255:0] 1: tmp[255:0] := a[511:256] ESAC FOR j := 0 to 3 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_maskz_extracti64x4_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzFixupimmPd ¶
func M512MaskzFixupimmPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)
M512MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzFixupimmPs ¶
func M512MaskzFixupimmPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)
M512MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzFixupimmRoundPd ¶
func M512MaskzFixupimmRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)
M512MaskzFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 7
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzFixupimmRoundPs ¶
func M512MaskzFixupimmRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)
M512MaskzFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 15
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzFmaddPd ¶
func M512MaskzFmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_pd'. Requires AVX512F.
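Since these are demonstration signatures rather than working intrinsics, the zeromasking semantics are easiest to check against a plain-Go scalar model. The sketch below is illustrative only (the helper name and the array/mask representations are this example's own), and it ignores the single-rounding guarantee a real fused multiply-add provides:

    package main

    import "fmt"

    // maskzFmaddPd mirrors the loop above: lanes whose mask bit is set get
    // a*b+c, all other lanes are zeroed.
    func maskzFmaddPd(k uint8, a, b, c [8]float64) (dst [8]float64) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j]*b[j] + c[j]
            }
        }
        return
    }

    func main() {
        a := [8]float64{1, 2, 3, 4, 5, 6, 7, 8}
        b := [8]float64{2, 2, 2, 2, 2, 2, 2, 2}
        c := [8]float64{1, 1, 1, 1, 1, 1, 1, 1}
        fmt.Println(maskzFmaddPd(0x55, a, b, c)) // [3 0 7 0 11 0 15 0]
    }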
func M512MaskzFmaddPs ¶
func M512MaskzFmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_ps'. Requires AVX512F.
func M512MaskzFmaddRoundPd ¶
func M512MaskzFmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_round_pd'. Requires AVX512F.
func M512MaskzFmaddRoundPs ¶
func M512MaskzFmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_round_ps'. Requires AVX512F.
func M512MaskzFmaddsubPd ¶
func M512MaskzFmaddsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_pd'. Requires AVX512F.
func M512MaskzFmaddsubPs ¶
func M512MaskzFmaddsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_ps'. Requires AVX512F.
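The even/odd alternation is the only subtlety here. Below is a hedged scalar companion to the earlier sketch (same illustrative conventions, single precision with a 16-bit mask), meant as a drop-in next to it:

    // maskzFmaddsubPs mirrors the pseudocode: even-indexed lanes compute
    // a*b-c, odd-indexed lanes a*b+c; unselected lanes stay zero.
    func maskzFmaddsubPs(k uint16, a, b, c [16]float32) (dst [16]float32) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) == 0 {
                continue // zeromask: dst[j] remains 0
            }
            if j%2 == 0 {
                dst[j] = a[j]*b[j] - c[j]
            } else {
                dst[j] = a[j]*b[j] + c[j]
            }
        }
        return
    }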
func M512MaskzFmaddsubRoundPd ¶
func M512MaskzFmaddsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_round_pd'. Requires AVX512F.
func M512MaskzFmaddsubRoundPs ¶
func M512MaskzFmaddsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_round_ps'. Requires AVX512F.
func M512MaskzFmsubPd ¶
func M512MaskzFmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_pd'. Requires AVX512F.
func M512MaskzFmsubPs ¶
func M512MaskzFmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_ps'. Requires AVX512F.
func M512MaskzFmsubRoundPd ¶
func M512MaskzFmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_round_pd'. Requires AVX512F.
func M512MaskzFmsubRoundPs ¶
func M512MaskzFmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_round_ps'. Requires AVX512F.
func M512MaskzFmsubaddPd ¶
func M512MaskzFmsubaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_pd'. Requires AVX512F.
func M512MaskzFmsubaddPs ¶
func M512MaskzFmsubaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_ps'. Requires AVX512F.
func M512MaskzFmsubaddRoundPd ¶
func M512MaskzFmsubaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_round_pd'. Requires AVX512F.
func M512MaskzFmsubaddRoundPs ¶
func M512MaskzFmsubaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_round_ps'. Requires AVX512F.
func M512MaskzFnmaddPd ¶
func M512MaskzFnmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_pd'. Requires AVX512F.
func M512MaskzFnmaddPs ¶
func M512MaskzFnmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_ps'. Requires AVX512F.
func M512MaskzFnmaddRoundPd ¶
func M512MaskzFnmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_round_pd'. Requires AVX512F.
func M512MaskzFnmaddRoundPs ¶
func M512MaskzFnmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_round_ps'. Requires AVX512F.
func M512MaskzFnmsubPd ¶
func M512MaskzFnmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
M512MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_pd'. Requires AVX512F.
func M512MaskzFnmsubPs ¶
func M512MaskzFnmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
M512MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_ps'. Requires AVX512F.
func M512MaskzFnmsubRoundPd ¶
func M512MaskzFnmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
M512MaskzFnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_round_pd'. Requires AVX512F.
func M512MaskzFnmsubRoundPs ¶
func M512MaskzFnmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
M512MaskzFnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_round_ps'. Requires AVX512F.
func M512MaskzGetexpPd ¶
func M512MaskzGetexpPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
M512MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_pd'. Requires AVX512F.
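For finite nonzero inputs, ConvertExpFP64 amounts to floor(log2(|x|)), which Go's standard library exposes as math.Logb. Below is a minimal sketch under that assumption; the real instruction additionally defines results for zero, infinities, NaN and denormals:

    package main

    import (
        "fmt"
        "math"
    )

    // maskzGetexpPd approximates ConvertExpFP64 with math.Logb, which
    // returns the unbiased binary exponent of x as a float64.
    func maskzGetexpPd(k uint8, a [8]float64) (dst [8]float64) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = math.Logb(a[j])
            }
        }
        return
    }

    func main() {
        fmt.Println(maskzGetexpPd(0xFF, [8]float64{1, 2, 3, 4, 0.5, 10, 100, 1000}))
        // [0 1 1 2 -1 3 6 9]
    }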
func M512MaskzGetexpPs ¶
func M512MaskzGetexpPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
M512MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_ps'. Requires AVX512F.
func M512MaskzGetexpRoundPd ¶
func M512MaskzGetexpRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)
M512MaskzGetexpRoundPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_round_pd'. Requires AVX512F.
func M512MaskzGetexpRoundPs ¶
func M512MaskzGetexpRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)
M512MaskzGetexpRoundPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_round_ps'. Requires AVX512F.
func M512MaskzGetmantPd ¶
func M512MaskzGetmantPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)
M512MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_pd'. Requires AVX512F.
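One interval/sign combination is enough to illustrate GetNormalizedMantissa. The sketch below models only the _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src case via math.Frexp, which splits x into a fraction in [0.5, 1) and a power of two; doubling the fraction lands in [1, 2). Special values are simply passed through here, whereas the hardware consults its own table:

    package main

    import (
        "fmt"
        "math"
    )

    // getmant12 normalizes |x| into [1, 2) while keeping the source sign,
    // i.e. GetNormalizedMantissa(x, _MM_MANT_SIGN_src, _MM_MANT_NORM_1_2).
    func getmant12(x float64) float64 {
        if x == 0 || math.IsNaN(x) || math.IsInf(x, 0) {
            return x // real hardware has dedicated special-case results
        }
        frac, _ := math.Frexp(x) // |frac| in [0.5, 1)
        return frac * 2          // |result| in [1, 2)
    }

    func main() {
        fmt.Println(getmant12(48))   // 1.5  (48 = 1.5 * 2^5)
        fmt.Println(getmant12(-0.2)) // -1.6 (-0.2 = -1.6 * 2^-3)
    }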
func M512MaskzGetmantPs ¶
func M512MaskzGetmantPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)
M512MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_ps'. Requires AVX512F.
func M512MaskzGetmantRoundPd ¶
func M512MaskzGetmantRoundPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)
M512MaskzGetmantRoundPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_round_pd'. Requires AVX512F.
func M512MaskzGetmantRoundPs ¶
func M512MaskzGetmantRoundPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)
M512MaskzGetmantRoundPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_round_ps'. Requires AVX512F.
func M512MaskzInsertf32x4 ¶
func M512MaskzInsertf32x4(k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)
M512MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_maskz_insertf32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
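In scalar terms the insertion is an array copy at 4-float granularity followed by the usual zeromask pass. An illustrative model (names and types are this sketch's own):

    // maskzInsertf32x4 copies a, overwrites the 128-bit lane selected by
    // imm8[1:0] (4 consecutive floats) with b, then applies the zeromask.
    func maskzInsertf32x4(k uint16, a [16]float32, b [4]float32, imm8 byte) (dst [16]float32) {
        tmp := a
        copy(tmp[(imm8&3)*4:], b[:])
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }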
func M512MaskzInsertf64x4 ¶
func M512MaskzInsertf64x4(k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)
M512MaskzInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_maskz_insertf64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzInserti32x4 ¶
func M512MaskzInserti32x4(k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)
M512MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[1:0]) of 0: tmp[127:0] := b[127:0] 1: tmp[255:128] := b[127:0] 2: tmp[383:256] := b[127:0] 3: tmp[511:384] := b[127:0] ESAC FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_maskz_inserti32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzInserti64x4 ¶
func M512MaskzInserti64x4(k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)
M512MaskzInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[511:0] := a[511:0] CASE (imm8[0]) of 0: tmp[255:0] := b[255:0] 1: tmp[511:256] := b[255:0] ESAC FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_maskz_inserti64x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzMaxEpi32 ¶
func M512MaskzMaxEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSD'. Intrinsic: '_mm512_maskz_max_epi32'. Requires AVX512F.
func M512MaskzMaxEpi64 ¶
func M512MaskzMaxEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_maskz_max_epi64'. Requires AVX512F.
func M512MaskzMaxEpu32 ¶
func M512MaskzMaxEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUD'. Intrinsic: '_mm512_maskz_max_epu32'. Requires AVX512F.
func M512MaskzMaxEpu64 ¶
func M512MaskzMaxEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_maskz_max_epu64'. Requires AVX512F.
func M512MaskzMaxPd ¶
func M512MaskzMaxPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
M512MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_pd'. Requires AVX512F.
func M512MaskzMaxPs ¶
func M512MaskzMaxPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
M512MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_ps'. Requires AVX512F.
func M512MaskzMaxRoundPd ¶
func M512MaskzMaxRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
M512MaskzMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_round_pd'. Requires AVX512F.
func M512MaskzMaxRoundPs ¶
func M512MaskzMaxRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
M512MaskzMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_round_ps'. Requires AVX512F.
func M512MaskzMinEpi32 ¶
func M512MaskzMinEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSD'. Intrinsic: '_mm512_maskz_min_epi32'. Requires AVX512F.
func M512MaskzMinEpi64 ¶
func M512MaskzMinEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm512_maskz_min_epi64'. Requires AVX512F.
func M512MaskzMinEpu32 ¶
func M512MaskzMinEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUD'. Intrinsic: '_mm512_maskz_min_epu32'. Requires AVX512F.
func M512MaskzMinEpu64 ¶
func M512MaskzMinEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm512_maskz_min_epu64'. Requires AVX512F.
func M512MaskzMinPd ¶
func M512MaskzMinPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
M512MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_pd'. Requires AVX512F.
func M512MaskzMinPs ¶
func M512MaskzMinPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
M512MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_ps'. Requires AVX512F.
func M512MaskzMinRoundPd ¶
func M512MaskzMinRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)
M512MaskzMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_round_pd'. Requires AVX512F.
func M512MaskzMinRoundPs ¶
func M512MaskzMinRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)
M512MaskzMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_round_ps'. Requires AVX512F.
func M512MaskzMovEpi32 ¶
func M512MaskzMovEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)
M512MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQA32'. Intrinsic: '_mm512_maskz_mov_epi32'. Requires AVX512F.
func M512MaskzMovEpi64 ¶
func M512MaskzMovEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)
M512MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQA64'. Intrinsic: '_mm512_maskz_mov_epi64'. Requires AVX512F.
func M512MaskzMovPd ¶
func M512MaskzMovPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
M512MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVAPD'. Intrinsic: '_mm512_maskz_mov_pd'. Requires AVX512F.
func M512MaskzMovPs ¶
func M512MaskzMovPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
M512MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVAPS'. Intrinsic: '_mm512_maskz_mov_ps'. Requires AVX512F.
func M512MaskzMovedupPd ¶
func M512MaskzMovedupPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)
M512MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] tmp[191:128] := a[191:128] tmp[255:192] := a[191:128] tmp[319:256] := a[319:256] tmp[383:320] := a[319:256] tmp[447:384] := a[447:384] tmp[511:448] := a[447:384] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_maskz_movedup_pd'. Requires AVX512F.
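A hedged scalar model of the duplication pattern; each 64-bit pair takes its even-indexed element twice before the zeromask is applied:

    // maskzMovedupPd duplicates each even-indexed element into the odd slot
    // of its pair, then applies the zeromask.
    func maskzMovedupPd(k uint8, a [8]float64) (dst [8]float64) {
        var tmp [8]float64
        for j := 0; j < 8; j += 2 {
            tmp[j], tmp[j+1] = a[j], a[j]
        }
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }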
func M512MaskzMovehdupPs ¶
func M512MaskzMovehdupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
M512MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] tmp[159:128] := a[191:160] tmp[191:160] := a[191:160] tmp[223:192] := a[255:224] tmp[255:224] := a[255:224] tmp[287:256] := a[319:288] tmp[319:288] := a[319:288] tmp[351:320] := a[383:352] tmp[383:352] := a[383:352] tmp[415:384] := a[447:416] tmp[447:416] := a[447:416] tmp[479:448] := a[511:480] tmp[511:480] := a[511:480] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_maskz_movehdup_ps'. Requires AVX512F.
func M512MaskzMoveldupPs ¶
func M512MaskzMoveldupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)
M512MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] tmp[159:128] := a[159:128] tmp[191:160] := a[159:128] tmp[223:192] := a[223:192] tmp[255:224] := a[223:192] tmp[287:256] := a[287:256] tmp[319:288] := a[287:256] tmp[351:320] := a[351:320] tmp[383:352] := a[351:320] tmp[415:384] := a[415:384] tmp[447:416] := a[415:384] tmp[479:448] := a[479:448] tmp[511:480] := a[479:448] FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_maskz_moveldup_ps'. Requires AVX512F.
func M512MaskzMulEpi32 ¶
func M512MaskzMulEpi32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm512_maskz_mul_epi32'. Requires AVX512F.
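The key detail is that only the low 32 bits of each 64-bit lane participate, sign-extended into a full 64-bit product. In Go that is two explicit conversions (illustrative model):

    // maskzMulEpi32 multiplies the low 32 bits of each 64-bit lane as signed
    // integers; int32(...) keeps exactly the low half, int64(...) widens it.
    func maskzMulEpi32(k uint8, a, b [8]int64) (dst [8]int64) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
            }
        }
        return
    }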
func M512MaskzMulEpu32 ¶
func M512MaskzMulEpu32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_maskz_mul_epu32'. Requires AVX512F.
func M512MaskzMulPd ¶
func M512MaskzMulPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
M512MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_pd'. Requires AVX512F.
func M512MaskzMulPs ¶
func M512MaskzMulPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
M512MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_ps'. Requires AVX512F.
func M512MaskzMulRoundPd ¶
func M512MaskzMulRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
M512MaskzMulRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_round_pd'. Requires AVX512F.
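The 'rounding' argument is a bitwise OR of one rounding direction with, optionally, _MM_FROUND_NO_EXC. Those constants belong to the C intrinsics headers and are not defined by this demonstration package; the values below are reproduced by hand purely for illustration:

    // Values as defined in C's <immintrin.h>; reproduced here only for
    // illustration, this package does not export them.
    const (
        _MM_FROUND_TO_NEAREST_INT = 0x00
        _MM_FROUND_TO_NEG_INF     = 0x01
        _MM_FROUND_TO_POS_INF     = 0x02
        _MM_FROUND_TO_ZERO        = 0x03
        _MM_FROUND_CUR_DIRECTION  = 0x04
        _MM_FROUND_NO_EXC         = 0x08
    )

    // Example: truncate toward zero and suppress exceptions.
    func truncNoExc() int { return _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }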
func M512MaskzMulRoundPs ¶
func M512MaskzMulRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
M512MaskzMulRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_round_ps'. Requires AVX512F.
func M512MaskzMulloEpi32 ¶
func M512MaskzMulloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULLD'. Intrinsic: '_mm512_maskz_mullo_epi32'. Requires AVX512F.
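Keeping only the low 32 bits of the doubled-width product is exactly Go's wrapping int32 multiplication, so a scalar model is one line per lane (illustrative):

    // maskzMulloEpi32 computes the full 32x32 product and keeps the low
    // 32 bits; Go's int32 multiply already wraps modulo 2^32.
    func maskzMulloEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j] * b[j]
            }
        }
        return
    }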
func M512MaskzOrEpi32 ¶
func M512MaskzOrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPORD'. Intrinsic: '_mm512_maskz_or_epi32'. Requires AVX512F.
func M512MaskzOrEpi64 ¶
func M512MaskzOrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPORQ'. Intrinsic: '_mm512_maskz_or_epi64'. Requires AVX512F.
func M512MaskzPermutePd ¶
func M512MaskzPermutePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)
M512MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0] IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64] IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0] IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64] IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128] IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192] IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128] IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192] IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256] IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320] IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256] IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320] IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384] IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448] IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384] IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448] FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzPermutePs ¶
func M512MaskzPermutePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
M512MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzPermutevarPd ¶
M512MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permutevar_pd'. Requires AVX512F.
func M512MaskzPermutevarPs ¶
M512MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
  ESAC
  RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permutevar_ps'. Requires AVX512F.
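The within-lane variable shuffle above can likewise be emulated in plain Go. A hypothetical sketch (float32 and uint32 arrays stand in for the x86 vector types; only the low 2 bits of each control element matter):

package main

import "fmt"

// maskzPermutevarPs mirrors the pseudocode for _mm512_maskz_permutevar_ps:
// within each 128-bit lane (4 floats), the low 2 bits of the corresponding
// element of b select one of that lane's 4 source elements.
func maskzPermutevarPs(k uint16, a [16]float32, b [16]uint32) (dst [16]float32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            lane := j &^ 3 // first element of this 128-bit lane
            dst[j] = a[lane+int(b[j]&3)]
        }
    }
    return
}

func main() {
    var a [16]float32
    for i := range a {
        a[i] = float32(i)
    }
    b := [16]uint32{3, 2, 1, 0} // reverse lane 0; zero controls elsewhere
    // Lanes 4..15 all select element 0 of their own 128-bit lane.
    fmt.Println(maskzPermutevarPs(0xFFFF, a, b))
}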
func M512MaskzPermutex2varEpi32 ¶
func M512MaskzPermutex2varEpi32(k x86.Mmask16, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  off := idx[i+3:i]*32
  IF k[j]
    dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_maskz_permutex2var_epi32'. Requires AVX512F.
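A plain-Go sketch of the two-source permute above (hypothetical helper; the low 4 bits of each idx element pick a source lane, bit 4 picks the source vector):

package main

import "fmt"

// maskzPermutex2varEpi32 mirrors the pseudocode for
// _mm512_maskz_permutex2var_epi32: idx bit 4 chooses between a (0) and b (1),
// idx bits 3:0 choose the source lane; zeromasked lanes stay 0.
func maskzPermutex2varEpi32(k uint16, a, idx, b [16]uint32) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) == 0 {
            continue
        }
        off := idx[j] & 0xF
        if idx[j]&0x10 != 0 {
            dst[j] = b[off]
        } else {
            dst[j] = a[off]
        }
    }
    return
}

func main() {
    var a, b [16]uint32
    for i := range a {
        a[i] = uint32(i)       // 0..15
        b[i] = uint32(100 + i) // 100..115
    }
    idx := [16]uint32{0, 0x10, 5, 0x15} // a[0], b[0], a[5], b[5]
    fmt.Println(maskzPermutex2varEpi32(0xF, a, idx, b))
}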
func M512MaskzPermutex2varEpi64 ¶
func M512MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  off := idx[i+2:i]*64
  IF k[j]
    dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_maskz_permutex2var_epi64'. Requires AVX512F.
func M512MaskzPermutex2varPd ¶
M512MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  off := idx[i+2:i]*64
  IF k[j]
    dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_maskz_permutex2var_pd'. Requires AVX512F.
func M512MaskzPermutex2varPs ¶
M512MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  off := idx[i+3:i]*32
  IF k[j]
    dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_maskz_permutex2var_ps'. Requires AVX512F.
func M512MaskzPermutexEpi64 ¶
M512MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[63:0] := src[63:0]
    1: tmp[63:0] := src[127:64]
    2: tmp[63:0] := src[191:128]
    3: tmp[63:0] := src[255:192]
  ESAC
  RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzPermutexPd ¶
M512MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[63:0] := src[63:0]
    1: tmp[63:0] := src[127:64]
    2: tmp[63:0] := src[191:128]
    3: tmp[63:0] := src[255:192]
  ESAC
  RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzPermutexvarEpi32 ¶
M512MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  id := idx[i+3:i]*32
  IF k[j]
    dst[i+31:i] := a[id+31:id]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm512_maskz_permutexvar_epi32'. Requires AVX512F.
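The full-crossing permute above has a direct plain-Go analogue (hypothetical sketch; note that 'idx' precedes 'a' in the intrinsic's operand order):

package main

import "fmt"

// maskzPermutexvarEpi32 mirrors the pseudocode for
// _mm512_maskz_permutexvar_epi32: the low 4 bits of each idx element select
// a source lane from anywhere in the 512-bit vector.
func maskzPermutexvarEpi32(k uint16, idx, a [16]uint32) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = a[idx[j]&0xF]
        }
    }
    return
}

func main() {
    var a [16]uint32
    for i := range a {
        a[i] = uint32(i * 10)
    }
    idx := [16]uint32{15, 14, 13, 12} // lanes 0..3 read elements 15..12
    // The remaining idx elements are 0, so lanes 4..15 broadcast a[0].
    fmt.Println(maskzPermutexvarEpi32(0xFFFF, idx, a))
}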
func M512MaskzPermutexvarEpi64 ¶
M512MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  id := idx[i+2:i]*64
  IF k[j]
    dst[i+63:i] := a[id+63:id]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutexvar_epi64'. Requires AVX512F.
func M512MaskzPermutexvarPd ¶
M512MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  id := idx[i+2:i]*64
  IF k[j]
    dst[i+63:i] := a[id+63:id]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutexvar_pd'. Requires AVX512F.
func M512MaskzPermutexvarPs ¶
M512MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  id := idx[i+3:i]*32
  IF k[j]
    dst[i+31:i] := a[id+31:id]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm512_maskz_permutexvar_ps'. Requires AVX512F.
func M512MaskzRcp14Pd ¶
M512MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm512_maskz_rcp14_pd'. Requires AVX512F.
func M512MaskzRcp14Ps ¶
M512MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm512_maskz_rcp14_ps'. Requires AVX512F.
func M512MaskzRolEpi32 ¶
M512MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm512_maskz_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRolEpi64 ¶
M512MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm512_maskz_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRolvEpi32 ¶
M512MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm512_maskz_rolv_epi32'. Requires AVX512F.
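math/bits already provides the modulo-32 rotate used by LEFT_ROTATE_DWORDS, so a plain-Go emulation of the entry above is short (hypothetical helper):

package main

import (
    "fmt"
    "math/bits"
)

// maskzRolvEpi32 mirrors the pseudocode for _mm512_maskz_rolv_epi32;
// bits.RotateLeft32 reduces the count modulo 32, matching LEFT_ROTATE_DWORDS.
func maskzRolvEpi32(k uint16, a, b [16]uint32) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
        }
    }
    return
}

func main() {
    a := [16]uint32{0x80000001}
    b := [16]uint32{4}
    fmt.Printf("%#x\n", maskzRolvEpi32(1, a, b)[0]) // 0x18
}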
func M512MaskzRolvEpi64 ¶
M512MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm512_maskz_rolv_epi64'. Requires AVX512F.
func M512MaskzRorEpi32 ¶
M512MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm512_maskz_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRorEpi64 ¶
M512MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm512_maskz_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRorvEpi32 ¶
M512MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
  count := count_src modulo 32
  RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm512_maskz_rorv_epi32'. Requires AVX512F.
func M512MaskzRorvEpi64 ¶
M512MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
  count := count_src modulo 64
  RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm512_maskz_rorv_epi64'. Requires AVX512F.
func M512MaskzRoundscalePd ¶
M512MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
  ESAC
  dst[63:0] := 2^-M * tmp[63:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[63:0] != dst[63:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
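The scale-round-unscale core of RoundTo_IntegerPD maps cleanly onto the math package. A hypothetical plain-Go sketch for one element (it models only the imm8[2] == 0 path and omits the SPE/#PE bookkeeping):

package main

import (
    "fmt"
    "math"
)

// roundscalePD rounds src to 2^-M precision, where M = imm8[7:4] and
// imm8[1:0] picks the rounding direction, mirroring RoundTo_IntegerPD.
func roundscalePD(src float64, imm8 byte) float64 {
    m := float64(imm8 >> 4) // number of fraction bits to keep
    scaled := src * math.Exp2(m)
    var tmp float64
    switch imm8 & 3 { // rounding_direction (imm8[2] == 0 case)
    case 0:
        tmp = math.RoundToEven(scaled) // round to nearest even
    case 1:
        tmp = math.Floor(scaled) // round toward -inf
    case 2:
        tmp = math.Ceil(scaled) // round toward +inf
    case 3:
        tmp = math.Trunc(scaled) // round toward zero
    }
    return tmp * math.Exp2(-m) // scale back down
}

func main() {
    fmt.Println(roundscalePD(1.23456, 0x40)) // M=4: round to 1/16ths -> 1.25
}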
func M512MaskzRoundscalePs ¶
M512MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
  ESAC
  dst[31:0] := 2^-M * tmp[31:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[31:0] != dst[31:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRoundscaleRoundPd ¶
M512MaskzRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
  ESAC
  dst[63:0] := 2^-M * tmp[63:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[63:0] != dst[63:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRoundscaleRoundPs ¶
M512MaskzRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
  IF(imm8[2] == 1)
    rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
  ELSE
    rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
  FI
  M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
  CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
  ESAC
  dst[31:0] := 2^-M * tmp[31:0] // scale back down
  IF imm8[3] == 0 // check SPE
    IF src[31:0] != dst[31:0] // check if precision has been lost
      set_precision() // set #PE
    FI
  FI
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzRsqrt14Pd ¶
M512MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_maskz_rsqrt14_pd'. Requires AVX512F.
func M512MaskzRsqrt14Ps ¶
M512MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_maskz_rsqrt14_ps'. Requires AVX512F.
func M512MaskzScalefPd ¶
M512MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_pd'. Requires AVX512F.
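Ignoring the NaN and DAZ branches, SCALE reduces to one line of Go. A hypothetical sketch of the happy path for a single element:

package main

import (
    "fmt"
    "math"
)

// scalefPD mirrors the final line of SCALE above: multiply src1 by two
// raised to the floor of src2 (NaN/SNaN and denormal handling omitted).
func scalefPD(src1, src2 float64) float64 {
    return src1 * math.Exp2(math.Floor(src2))
}

func main() {
    fmt.Println(scalefPD(3.0, 2.9))  // 3 * 2^2  = 12
    fmt.Println(scalefPD(3.0, -1.1)) // 3 * 2^-2 = 0.75
}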
func M512MaskzScalefPs ¶
M512MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_ps'. Requires AVX512F.
func M512MaskzScalefRoundPd ¶
M512MaskzScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
  RETURN dst[63:0]
}
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_round_pd'. Requires AVX512F.
func M512MaskzScalefRoundPs ¶
M512MaskzScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
  IF (src2 == NaN)
    IF (src2 == SNaN)
      RETURN QNAN(src2)
    FI
  ELSE IF (src1 == NaN)
    IF (src1 == SNaN)
      RETURN QNAN(src1)
    FI
    IF (src2 != INF)
      RETURN QNAN(src1)
    FI
  ELSE
    tmp_src2 := src2
    tmp_src1 := src1
    IF (src2 is denormal AND MXCSR.DAZ)
      tmp_src2 := 0
    FI
    IF (src1 is denormal AND MXCSR.DAZ)
      tmp_src1 := 0
    FI
  FI
  dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
  RETURN dst[31:0]
}
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_round_ps'. Requires AVX512F.
func M512MaskzSet1Epi32 ¶
M512MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := a[31:0]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_set1_epi32'. Requires AVX512F.
func M512MaskzSet1Epi64 ¶
M512MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[63:0]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_set1_epi64'. Requires AVX512F.
func M512MaskzShuffleEpi32 ¶
M512MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
  ESAC
  RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSHUFD'. Intrinsic: '_mm512_maskz_shuffle_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
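The SELECT4 control fields decode mechanically; a hypothetical plain-Go emulation of the entry above:

package main

import "fmt"

// maskzShuffleEpi32 mirrors the pseudocode for _mm512_maskz_shuffle_epi32:
// each destination dword takes one of the 4 dwords of its own 128-bit lane,
// chosen by a 2-bit field of imm8; zeromasked lanes stay 0.
func maskzShuffleEpi32(k uint16, a [16]uint32, imm8 byte) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) != 0 {
            lane := j &^ 3                     // first dword of this lane
            sel := (imm8 >> uint(2*(j&3))) & 3 // SELECT4 control field
            dst[j] = a[lane+int(sel)]
        }
    }
    return
}

func main() {
    var a [16]uint32
    for i := range a {
        a[i] = uint32(i)
    }
    // 0b00011011 reverses the dwords within every 128-bit lane.
    fmt.Println(maskzShuffleEpi32(0xFFFF, a, 0b00011011))
}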
func M512MaskzShuffleF32x4 ¶
M512MaskzShuffleF32x4: Shuffle 128 bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_maskz_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzShuffleF64x2 ¶
M512MaskzShuffleF64x2: Shuffle 128 bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_maskz_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzShuffleI32x4 ¶
M512MaskzShuffleI32x4: Shuffle 128 bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_maskz_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzShuffleI64x2 ¶
M512MaskzShuffleI64x2: Shuffle 128 bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[127:0] := src[127:0]
    1: tmp[127:0] := src[255:128]
    2: tmp[127:0] := src[383:256]
    3: tmp[127:0] := src[511:384]
  ESAC
  RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_maskz_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzShufflePd ¶
M512MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm512_maskz_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzShufflePs ¶
M512MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
  CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
  ESAC
  RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm512_maskz_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSllEpi32 ¶
M512MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_sll_epi32'. Requires AVX512F.
func M512MaskzSllEpi64 ¶
M512MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_sll_epi64'. Requires AVX512F.
func M512MaskzSlliEpi32 ¶
M512MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF imm8[7:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_slli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSlliEpi64 ¶
M512MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSllvEpi32 ¶
M512MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLVD'. Intrinsic: '_mm512_maskz_sllv_epi32'. Requires AVX512F.
func M512MaskzSllvEpi64 ¶
M512MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_maskz_sllv_epi64'. Requires AVX512F.
func M512MaskzSqrtPd ¶
M512MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SQRT(a[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_pd'. Requires AVX512F.
func M512MaskzSqrtPs ¶
M512MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SQRT(a[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_ps'. Requires AVX512F.
func M512MaskzSqrtRoundPd ¶
M512MaskzSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SQRT(a[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_round_pd'. Requires AVX512F.
func M512MaskzSqrtRoundPs ¶
M512MaskzSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SQRT(a[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_round_ps'. Requires AVX512F.
func M512MaskzSraEpi32 ¶
M512MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := SignBit
    ELSE
      dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_sra_epi32'. Requires AVX512F.
func M512MaskzSraEpi64 ¶
M512MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := SignBit
    ELSE
      dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_sra_epi64'. Requires AVX512F.
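Go's arithmetic right shift sign-fills for any count at or beyond the operand width, so the `count > 63` branch above needs no special casing in a plain-Go emulation (hypothetical helper):

package main

import "fmt"

// maskzSraEpi64 mirrors the pseudocode for _mm512_maskz_sra_epi64: an
// arithmetic shift that degenerates to all sign bits once count > 63.
func maskzSraEpi64(k uint8, a [8]int64, count uint64) (dst [8]int64) {
    for j := 0; j < 8; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = a[j] >> count // Go sign-fills for count >= 64
        }
    }
    return
}

func main() {
    a := [8]int64{-8, 8}
    fmt.Println(maskzSraEpi64(0b11, a, 2))   // [-2 2 0 ...]
    fmt.Println(maskzSraEpi64(0b11, a, 100)) // [-1 0 0 ...] (sign fill)
}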
func M512MaskzSraiEpi32 ¶
M512MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF imm8[7:0] > 31
      dst[i+31:i] := SignBit
    ELSE
      dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_srai_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSraiEpi64 ¶
M512MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := SignBit
    ELSE
      dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSravEpi32 ¶
M512MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAVD'. Intrinsic: '_mm512_maskz_srav_epi32'. Requires AVX512F.
func M512MaskzSravEpi64 ¶
M512MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_maskz_srav_epi64'. Requires AVX512F.
func M512MaskzSrlEpi32 ¶
M512MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF count[63:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srl_epi32'. Requires AVX512F.
func M512MaskzSrlEpi64 ¶
M512MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF count[63:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srl_epi64'. Requires AVX512F.
func M512MaskzSrliEpi32 ¶
M512MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    IF imm8[7:0] > 31
      dst[i+31:i] := 0
    ELSE
      dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSrliEpi64 ¶
M512MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    IF imm8[7:0] > 63
      dst[i+63:i] := 0
    ELSE
      dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
    FI
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzSrlvEpi32 ¶
M512MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLVD'. Intrinsic: '_mm512_maskz_srlv_epi32'. Requires AVX512F.
func M512MaskzSrlvEpi64 ¶
M512MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_maskz_srlv_epi64'. Requires AVX512F.
func M512MaskzSubEpi32 ¶
M512MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := a[i+31:i] - b[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSUBD'. Intrinsic: '_mm512_maskz_sub_epi32'. Requires AVX512F.
func M512MaskzSubEpi64 ¶
M512MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[i+63:i] - b[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm512_maskz_sub_epi64'. Requires AVX512F.
func M512MaskzSubPd ¶
M512MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[i+63:i] - b[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_pd'. Requires AVX512F.
func M512MaskzSubPs ¶
M512MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := a[i+31:i] - b[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_ps'. Requires AVX512F.
func M512MaskzSubRoundPd ¶
M512MaskzSubRoundPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := a[i+63:i] - b[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_round_pd'. Requires AVX512F.
func M512MaskzSubRoundPs ¶
M512MaskzSubRoundPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := a[i+31:i] - b[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_round_ps'. Requires AVX512F.
func M512MaskzTernarylogicEpi32 ¶
func M512MaskzTernarylogicEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15
  i := j*32
  IF k[j]
    FOR h := 0 to 31
      index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
      dst[i+h] := imm8[index[2:0]]
    ENDFOR
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_maskz_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
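The inner bit loop above translates directly to Go. A hypothetical sketch (imm8 = 0xE8 happens to be the truth table of the majority function):

package main

import "fmt"

// maskzTernarylogicEpi32 mirrors the pseudocode for
// _mm512_maskz_ternarylogic_epi32: at each bit position h, the bits of a, b
// and c form a 3-bit index, and that bit of imm8 becomes the result bit.
func maskzTernarylogicEpi32(k uint16, a, b, c [16]uint32, imm8 byte) (dst [16]uint32) {
    for j := 0; j < 16; j++ {
        if k&(1<<uint(j)) == 0 {
            continue // zeromasked lane stays 0
        }
        var r uint32
        for h := uint(0); h < 32; h++ {
            idx := (a[j]>>h&1)<<2 | (b[j]>>h&1)<<1 | (c[j]>>h&1)
            r |= uint32(imm8>>idx&1) << h
        }
        dst[j] = r
    }
    return
}

func main() {
    a := [16]uint32{0b1100}
    b := [16]uint32{0b1010}
    c := [16]uint32{0b1111}
    // 0xE8 encodes MAJ(a, b, c): a bit is set where at least two inputs are.
    fmt.Printf("%04b\n", maskzTernarylogicEpi32(1, a, b, c, 0xE8)[0]) // 1110
}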
func M512MaskzTernarylogicEpi64 ¶
func M512MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7
  i := j*64
  IF k[j]
    FOR h := 0 to 63
      index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
      dst[i+h] := imm8[index[2:0]]
    ENDFOR
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_maskz_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512MaskzUnpackhiEpi32 ¶
M512MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
  dst[31:0] := src1[95:64]
  dst[63:32] := src2[95:64]
  dst[95:64] := src1[127:96]
  dst[127:96] := src2[127:96]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
  i := j*32
  IF k[j]
    dst[i+31:i] := tmp_dst[i+31:i]
  ELSE
    dst[i+31:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi32'. Requires AVX512F.
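INTERLEAVE_HIGH_DWORDS is a fixed permutation per 128-bit lane; a hypothetical plain-Go emulation of the entry above:

package main

import "fmt"

// maskzUnpackhiEpi32 mirrors the pseudocode for _mm512_maskz_unpackhi_epi32:
// within each 128-bit lane the two high dwords of a and b interleave as
// a2, b2, a3, b3; the zeromask then clears unset lanes.
func maskzUnpackhiEpi32(k uint16, a, b [16]uint32) (dst [16]uint32) {
    for lane := 0; lane < 16; lane += 4 {
        tmp := [4]uint32{a[lane+2], b[lane+2], a[lane+3], b[lane+3]}
        for j := 0; j < 4; j++ {
            if k&(1<<uint(lane+j)) != 0 {
                dst[lane+j] = tmp[j]
            }
        }
    }
    return
}

func main() {
    var a, b [16]uint32
    for i := range a {
        a[i] = uint32(i)
        b[i] = uint32(100 + i)
    }
    fmt.Println(maskzUnpackhiEpi32(0xFFFF, a, b)) // [2 102 3 103 6 106 ...]
}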
func M512MaskzUnpackhiEpi64 ¶
M512MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[127:64]
  dst[127:64] := src2[127:64]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi64'. Requires AVX512F.
func M512MaskzUnpackhiPd ¶
M512MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
  dst[63:0] := src1[127:64]
  dst[127:64] := src2[127:64]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
  i := j*64
  IF k[j]
    dst[i+63:i] := tmp_dst[i+63:i]
  ELSE
    dst[i+63:i] := 0
  FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_maskz_unpackhi_pd'. Requires AVX512F.
func M512MaskzUnpackhiPs ¶
M512MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_maskz_unpackhi_ps'. Requires AVX512F.
func M512MaskzUnpackloEpi32 ¶
M512MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi32'. Requires AVX512F.
func M512MaskzUnpackloEpi64 ¶
M512MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi64'. Requires AVX512F.
func M512MaskzUnpackloPd ¶
M512MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_maskz_unpacklo_pd'. Requires AVX512F.
func M512MaskzUnpackloPs ¶
M512MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_maskz_unpacklo_ps'. Requires AVX512F.
func M512MaskzXorEpi32 ¶
M512MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPXORD'. Intrinsic: '_mm512_maskz_xor_epi32'. Requires AVX512F.
func M512MaskzXorEpi64 ¶
M512MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_maskz_xor_epi64'. Requires AVX512F.
func M512MaxEpi64 ¶
M512MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 7 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_max_epi64'. Requires AVX512F.
func M512MaxEpu64 ¶
M512MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 7 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_max_epu64'. Requires AVX512F.
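The signed and unsigned 64-bit maxima differ only in how the comparison interprets the same bit patterns, which a small Go sketch makes concrete (the names maxEpi64/maxEpu64 are illustrative):

    // maxEpi64 and maxEpu64 model _mm512_max_epi64 and
    // _mm512_max_epu64. The same 64-bit pattern can order differently
    // under the two views: 0xFFFFFFFFFFFFFFFF is -1 as int64 but the
    // largest possible uint64.
    func maxEpi64(a, b [8]int64) (dst [8]int64) {
        for j := range dst {
            if a[j] > b[j] {
                dst[j] = a[j]
            } else {
                dst[j] = b[j]
            }
        }
        return
    }

    func maxEpu64(a, b [8]uint64) (dst [8]uint64) {
        for j := range dst {
            if a[j] > b[j] {
                dst[j] = a[j]
            } else {
                dst[j] = b[j]
            }
        }
        return
    }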
func M512MaxPd ¶
M512MaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_pd'. Requires AVX512F.
func M512MaxPs ¶
M512MaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_ps'. Requires AVX512F.
func M512MaxRoundPd ¶
M512MaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.
Pass __MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := j*64 dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_round_pd'. Requires AVX512F.
func M512MaxRoundPs ¶
M512MaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.
Pass __MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := j*32 dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_round_ps'. Requires AVX512F.
func M512MinEpi64 ¶
M512MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 7 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm512_min_epi64'. Requires AVX512F.
func M512MinEpu64 ¶
M512MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 7 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm512_min_epu64'. Requires AVX512F.
func M512MinPd ¶
M512MinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_min_pd'. Requires AVX512F.
func M512MinPs ¶
M512MinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_min_ps'. Requires AVX512F.
func M512MinRoundPd ¶
M512MinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.
Pass __MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 7 i := j*64 dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm512_min_round_pd'. Requires AVX512F.
func M512MinRoundPs ¶
M512MinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.
Pass __MM_FROUND_NO_EXC to 'sae' to suppress all exceptions. FOR j := 0 to 15 i := j*32 dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm512_min_round_ps'. Requires AVX512F.
func M512MovedupPd ¶
M512MovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst'.
dst[63:0] := a[63:0] dst[127:64] := a[63:0] dst[191:128] := a[191:128] dst[255:192] := a[191:128] dst[319:256] := a[319:256] dst[383:320] := a[319:256] dst[447:384] := a[447:384] dst[511:448] := a[447:384] dst[MAX:512] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_movedup_pd'. Requires AVX512F.
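In scalar terms the duplication pattern is simply "copy each even-indexed element into the following odd slot", as this illustrative Go sketch shows:

    // movedupPd models _mm512_movedup_pd: result elements 2j and 2j+1
    // both receive source element 2j.
    func movedupPd(a [8]float64) (dst [8]float64) {
        for j := 0; j < 8; j += 2 {
            dst[j], dst[j+1] = a[j], a[j]
        }
        return
    }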
func M512MovehdupPs ¶
M512MovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.
dst[31:0] := a[63:32] dst[63:32] := a[63:32] dst[95:64] := a[127:96] dst[127:96] := a[127:96] dst[159:128] := a[191:160] dst[191:160] := a[191:160] dst[223:192] := a[255:224] dst[255:224] := a[255:224] dst[287:256] := a[319:288] dst[319:288] := a[319:288] dst[351:320] := a[383:352] dst[383:352] := a[383:352] dst[415:384] := a[447:416] dst[447:416] := a[447:416] dst[479:448] := a[511:480] dst[511:480] := a[511:480] dst[MAX:512] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_movehdup_ps'. Requires AVX512F.
func M512MoveldupPs ¶
M512MoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.
dst[31:0] := a[31:0] dst[63:32] := a[31:0] dst[95:64] := a[95:64] dst[127:96] := a[95:64] dst[159:128] := a[159:128] dst[191:160] := a[159:128] dst[223:192] := a[223:192] dst[255:224] := a[223:192] dst[287:256] := a[287:256] dst[319:288] := a[287:256] dst[351:320] := a[351:320] dst[383:352] := a[351:320] dst[415:384] := a[415:384] dst[447:416] := a[415:384] dst[479:448] := a[479:448] dst[511:480] := a[479:448] dst[MAX:512] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_moveldup_ps'. Requires AVX512F.
func M512MulEpi32 ¶
M512MulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mul_epi32'. Requires AVX512F.
func M512MulEpu32 ¶
M512MulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+31:i] * b[i+31:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mul_epu32'. Requires AVX512F.
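Both multiply variants read only the low 32 bits of each 64-bit element but keep the full 64-bit product; the difference is sign versus zero extension of the operands. A sketch in plain Go (helper names are invented):

    // mulEpi32 and mulEpu32 model _mm512_mul_epi32 / _mm512_mul_epu32
    // on eight 64-bit elements: only bits 31:0 of each element
    // participate, sign-extended for the signed form and zero-extended
    // for the unsigned form, and the full 64-bit product is stored.
    func mulEpi32(a, b [8]uint64) (dst [8]int64) {
        for j := range dst {
            dst[j] = int64(int32(uint32(a[j]))) * int64(int32(uint32(b[j])))
        }
        return
    }

    func mulEpu32(a, b [8]uint64) (dst [8]uint64) {
        for j := range dst {
            dst[j] = uint64(uint32(a[j])) * uint64(uint32(b[j]))
        }
        return
    }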
func M512MulloxEpi64 ¶
M512MulloxEpi64: Multiply the packed 64-bit integers in 'a' and 'b', and store the low 64 bits of each product in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[i+63:i] * b[i+63:i] ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_mullox_epi64'. Requires AVX512F.
func M512NearbyintPd ¶
M512NearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := NearbyInt(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_nearbyint_pd'. Requires AVX512F.
func M512NearbyintPs ¶
M512NearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := NearbyInt(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_nearbyint_ps'. Requires AVX512F.
func M512PermutePd ¶
M512PermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.
IF (imm8[0] == 0) dst[63:0] := a[63:0] IF (imm8[0] == 1) dst[63:0] := a[127:64] IF (imm8[1] == 0) dst[127:64] := a[63:0] IF (imm8[1] == 1) dst[127:64] := a[127:64] IF (imm8[2] == 0) dst[191:128] := a[191:128] IF (imm8[2] == 1) dst[191:128] := a[255:192] IF (imm8[3] == 0) dst[255:192] := a[191:128] IF (imm8[3] == 1) dst[255:192] := a[255:192] IF (imm8[4] == 0) dst[319:256] := a[319:256] IF (imm8[4] == 1) dst[319:256] := a[383:320] IF (imm8[5] == 0) dst[383:320] := a[319:256] IF (imm8[5] == 1) dst[383:320] := a[383:320] IF (imm8[6] == 0) dst[447:384] := a[447:384] IF (imm8[6] == 1) dst[447:384] := a[511:448] IF (imm8[7] == 0) dst[511:448] := a[447:384] IF (imm8[7] == 1) dst[511:448] := a[511:448] dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
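The IF chain above reduces to "bit j of imm8 picks the low or high qword of the 128-bit pair that result element j belongs to", as in this illustrative Go sketch:

    // permutePd models _mm512_permute_pd: bit j of imm8 selects the
    // low (0) or high (1) 64-bit element of the two-element 128-bit
    // lane containing result element j.
    func permutePd(a [8]float64, imm8 uint8) (dst [8]float64) {
        for j := 0; j < 8; j++ {
            base := j &^ 1 // first element of this 128-bit lane
            dst[j] = a[base+int(imm8>>j&1)]
        }
        return
    }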
func M512PermutePs ¶
M512PermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(a[127:0], imm8[5:4]) dst[127:96] := SELECT4(a[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(a[255:128], imm8[5:4]) dst[255:224] := SELECT4(a[255:128], imm8[7:6]) dst[287:256] := SELECT4(a[383:256], imm8[1:0]) dst[319:288] := SELECT4(a[383:256], imm8[3:2]) dst[351:320] := SELECT4(a[383:256], imm8[5:4]) dst[383:352] := SELECT4(a[383:256], imm8[7:6]) dst[415:384] := SELECT4(a[511:384], imm8[1:0]) dst[447:416] := SELECT4(a[511:384], imm8[3:2]) dst[479:448] := SELECT4(a[511:384], imm8[5:4]) dst[511:480] := SELECT4(a[511:384], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512PermutevarPd ¶
M512PermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.
IF (b[1] == 0) dst[63:0] := a[63:0] IF (b[1] == 1) dst[63:0] := a[127:64] IF (b[65] == 0) dst[127:64] := a[63:0] IF (b[65] == 1) dst[127:64] := a[127:64] IF (b[129] == 0) dst[191:128] := a[191:128] IF (b[129] == 1) dst[191:128] := a[255:192] IF (b[193] == 0) dst[255:192] := a[191:128] IF (b[193] == 1) dst[255:192] := a[255:192] IF (b[257] == 0) dst[319:256] := a[319:256] IF (b[257] == 1) dst[319:256] := a[383:320] IF (b[321] == 0) dst[383:320] := a[319:256] IF (b[321] == 1) dst[383:320] := a[383:320] IF (b[385] == 0) dst[447:384] := a[447:384] IF (b[385] == 1) dst[447:384] := a[511:448] IF (b[449] == 0) dst[511:448] := a[447:384] IF (b[449] == 1) dst[511:448] := a[511:448] dst[MAX:512] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permutevar_pd'. Requires AVX512F.
func M512PermutevarPs ¶
M512PermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], b[1:0]) dst[63:32] := SELECT4(a[127:0], b[33:32]) dst[95:64] := SELECT4(a[127:0], b[65:64]) dst[127:96] := SELECT4(a[127:0], b[97:96]) dst[159:128] := SELECT4(a[255:128], b[129:128]) dst[191:160] := SELECT4(a[255:128], b[161:160]) dst[223:192] := SELECT4(a[255:128], b[193:192]) dst[255:224] := SELECT4(a[255:128], b[225:224]) dst[287:256] := SELECT4(a[383:256], b[257:256]) dst[319:288] := SELECT4(a[383:256], b[289:288]) dst[351:320] := SELECT4(a[383:256], b[321:320]) dst[383:352] := SELECT4(a[383:256], b[353:352]) dst[415:384] := SELECT4(a[511:384], b[385:384]) dst[447:416] := SELECT4(a[511:384], b[417:416]) dst[479:448] := SELECT4(a[511:384], b[449:448]) dst[511:480] := SELECT4(a[511:384], b[481:480]) dst[MAX:512] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permutevar_ps'. Requires AVX512F.
func M512Permutex2varEpi32 ¶
M512Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_permutex2var_epi32'. Requires AVX512F.
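The selector layout (idx bits 3:0 choose an element, idx bit 4 chooses the source) can be sketched in Go as follows (illustrative only):

    // permutex2varEpi32 models _mm512_permutex2var_epi32: each result
    // element is taken from a (idx bit 4 clear) or b (idx bit 4 set)
    // at the position given by idx bits 3:0.
    func permutex2varEpi32(a, idx, b [16]uint32) (dst [16]uint32) {
        for j := range dst {
            off := idx[j] & 0xF
            if idx[j]&0x10 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return
    }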
func M512Permutex2varEpi64 ¶
M512Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_permutex2var_epi64'. Requires AVX512F.
func M512Permutex2varPd ¶
M512Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 off := idx[i+2:i]*64 dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_permutex2var_pd'. Requires AVX512F.
func M512Permutex2varPs ¶
M512Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 off := idx[i+3:i]*32 dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_permutex2var_ps'. Requires AVX512F.
func M512PermutexEpi64 ¶
M512PermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[319:256] := SELECT4(a[511:256], imm8[1:0]) dst[383:320] := SELECT4(a[511:256], imm8[3:2]) dst[447:384] := SELECT4(a[511:256], imm8[5:4]) dst[511:448] := SELECT4(a[511:256], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutex_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512PermutexPd ¶
M512PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[63:0] := src[63:0] 1: tmp[63:0] := src[127:64] 2: tmp[63:0] := src[191:128] 3: tmp[63:0] := src[255:192] ESAC RETURN tmp[63:0] } dst[63:0] := SELECT4(a[255:0], imm8[1:0]) dst[127:64] := SELECT4(a[255:0], imm8[3:2]) dst[191:128] := SELECT4(a[255:0], imm8[5:4]) dst[255:192] := SELECT4(a[255:0], imm8[7:6]) dst[319:256] := SELECT4(a[511:256], imm8[1:0]) dst[383:320] := SELECT4(a[511:256], imm8[3:2]) dst[447:384] := SELECT4(a[511:256], imm8[5:4]) dst[511:448] := SELECT4(a[511:256], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutex_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512PermutexvarEpi32 ¶
M512PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMD'. Intrinsic: '_mm512_permutexvar_epi32'. Requires AVX512F.
func M512PermutexvarEpi64 ¶
M512PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutexvar_epi64'. Requires AVX512F.
func M512PermutexvarPd ¶
M512PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 id := idx[i+2:i]*64 dst[i+63:i] := a[id+63:id] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutexvar_pd'. Requires AVX512F.
func M512PermutexvarPs ¶
M512PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 id := idx[i+3:i]*32 dst[i+31:i] := a[id+31:id] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMPS'. Intrinsic: '_mm512_permutexvar_ps'. Requires AVX512F.
func M512PowPd ¶
M512PowPd: Compute packed double-precision (64-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := (a[i+63:i])^(b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_pow_pd'. Requires AVX512F.
func M512PowPs ¶
M512PowPs: Compute packed single-precision (32-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := (a[i+31:i])^(b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_pow_ps'. Requires AVX512F.
func M512Rcp14Pd ¶
M512Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm512_rcp14_pd'. Requires AVX512F.
func M512Rcp14Ps ¶
M512Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm512_rcp14_ps'. Requires AVX512F.
func M512RecipPd ¶
M512RecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := (1 / a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_recip_pd'. Requires AVX512F.
func M512RecipPs ¶
M512RecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := (1 / a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_recip_ps'. Requires AVX512F.
func M512RemEpi16 ¶
M512RemEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 16-bit integers in 'dst'.
FOR j := 0 to 31 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epi16'. Requires AVX512F.
func M512RemEpi32 ¶
M512RemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epi32'. Requires AVX512F.
func M512RemEpi64 ¶
M512RemEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 64-bit integers in 'dst'.
FOR j := 0 to 7 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epi64'. Requires AVX512F.
func M512RemEpi8 ¶
M512RemEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 8-bit integers in 'dst'.
FOR j := 0 to 63 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epi8'. Requires AVX512F.
func M512RemEpu16 ¶
M512RemEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 16-bit integers in 'dst'.
FOR j := 0 to 31 i := 16*j dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epu16'. Requires AVX512F.
func M512RemEpu32 ¶
M512RemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.
FOR j := 0 to 15 i := 32*j dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epu32'. Requires AVX512F.
func M512RemEpu64 ¶
M512RemEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 64-bit integers in 'dst'.
FOR j := 0 to 7 i := 64*j dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epu64'. Requires AVX512F.
func M512RemEpu8 ¶
M512RemEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 8-bit integers in 'dst'.
FOR j := 0 to 63 i := 8*j dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rem_epu8'. Requires AVX512F.
func M512RintPd ¶
M512RintPd: Rounds the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := RoundToNearestEven(a[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rint_pd'. Requires AVX512F.
func M512RintPs ¶
M512RintPs: Rounds the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := RoundToNearestEven(a[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_rint_ps'. Requires AVX512F.
func M512RolEpi32 ¶
M512RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm512_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
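Because the rotate count is reduced modulo the element width, the per-element operation matches Go's math/bits rotate exactly; a minimal sketch:

    import "math/bits"

    // rolEpi32 models _mm512_rol_epi32; bits.RotateLeft32 reduces the
    // count modulo 32, just like LEFT_ROTATE_DWORDS above.
    func rolEpi32(a [16]uint32, imm8 uint8) (dst [16]uint32) {
        for j := range dst {
            dst[j] = bits.RotateLeft32(a[j], int(imm8))
        }
        return
    }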
func M512RolEpi64 ¶
M512RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm512_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512RolvEpi32 ¶
M512RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm512_rolv_epi32'. Requires AVX512F.
func M512RolvEpi64 ¶
M512RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm512_rolv_epi64'. Requires AVX512F.
func M512RorEpi32 ¶
M512RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm512_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512RorEpi64 ¶
M512RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm512_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512RorvEpi32 ¶
M512RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 15 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm512_rorv_epi32'. Requires AVX512F.
func M512RorvEpi64 ¶
M512RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 7 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm512_rorv_epi64'. Requires AVX512F.
func M512RoundscalePd ¶
M512RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
    CASE (rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
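Stripped of the rounding-mode and exception plumbing, roundscale is "round 2^M * x to an integer, then divide by 2^M", i.e. rounding to M binary fraction bits. A Go sketch covering only the round-to-nearest-even case (imm8[2:0] = 0):

    import "math"

    // roundscalePd models the round-to-nearest-even case of
    // _mm512_roundscale_pd with scaling factor m = imm8[7:4]: each
    // element is rounded to m binary fraction bits. The real
    // instruction also honors the other rounding directions and the
    // suppress-precision-exception bit.
    func roundscalePd(a [8]float64, m uint) (dst [8]float64) {
        scale := math.Ldexp(1, int(m)) // 2^M
        for j := range dst {
            dst[j] = math.RoundToEven(a[j]*scale) / scale
        }
        return
    }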
func M512RoundscalePs ¶
M512RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
    CASE (rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512RoundscaleRoundPd ¶
M512RoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
    CASE (rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_round_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512RoundscaleRoundPs ¶
M512RoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // Use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
    CASE (rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_round_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512Rsqrt14Pd ¶
M512Rsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ENDFOR dst[MAX:512] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_rsqrt14_pd'. Requires AVX512F.
func M512Rsqrt14Ps ¶
M512Rsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ENDFOR dst[MAX:512] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_rsqrt14_ps'. Requires AVX512F.
func M512ScalefPd ¶
M512ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_pd'. Requires AVX512F.
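Ignoring the NaN and denormal special cases, SCALE computes a * 2^FLOOR(b), which maps directly onto math.Ldexp; a sketch under that simplifying assumption:

    import "math"

    // scalefPd models the finite-input path of _mm512_scalef_pd:
    // dst[j] = a[j] * 2^FLOOR(b[j]). The NaN/denormal handling from
    // the pseudocode above is deliberately omitted here.
    func scalefPd(a, b [8]float64) (dst [8]float64) {
        for j := range dst {
            dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
        }
        return
    }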
func M512ScalefPs ¶
M512ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_ps'. Requires AVX512F.
func M512ScalefRoundPd ¶
M512ScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
FOR j := 0 to 7
    i := j*64
    dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_round_pd'. Requires AVX512F.
func M512ScalefRoundPs ¶
M512ScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
FOR j := 0 to 15
    i := j*32
    dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_round_ps'. Requires AVX512F.
func M512Set1Epi16 ¶
M512Set1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set1_epi16'. Requires AVX512F.
func M512Set1Epi32 ¶
M512Set1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_set1_epi32'. Requires AVX512F.
func M512Set1Epi64 ¶
M512Set1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_set1_epi64'. Requires AVX512F.
func M512Set1Epi8 ¶
M512Set1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set1_epi8'. Requires AVX512F.
func M512Set1Pd ¶
M512Set1Pd: Broadcast double-precision (64-bit) floating-point value 'a' to all elements of 'dst'.
FOR j := 0 to 7 i := j*64 dst[i+63:i] := a[63:0] ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set1_pd'. Requires AVX512F.
func M512Set1Ps ¶
M512Set1Ps: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[31:0] ENDFOR dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set1_ps'. Requires AVX512F.
func M512Set4Epi32 ¶
M512Set4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4-element sequence.
dst[31:0] := d dst[63:32] := c dst[95:64] := b dst[127:96] := a dst[159:128] := d dst[191:160] := c dst[223:192] := b dst[255:224] := a dst[287:256] := d dst[319:288] := c dst[351:320] := b dst[383:352] := a dst[415:384] := d dst[447:416] := c dst[479:448] := b dst[511:480] := a dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set4_epi32'. Requires AVX512F.
func M512Set4Epi64 ¶
M512Set4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4-element sequence.
dst[63:0] := d dst[127:64] := c dst[191:128] := b dst[255:192] := a dst[319:256] := d dst[383:320] := c dst[447:384] := b dst[511:448] := a dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set4_epi64'. Requires AVX512F.
func M512Set4Pd ¶
M512Set4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4-element sequence.
dst[63:0] := d dst[127:64] := c dst[191:128] := b dst[255:192] := a dst[319:256] := d dst[383:320] := c dst[447:384] := b dst[511:448] := a dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set4_pd'. Requires AVX512F.
func M512Set4Ps ¶
M512Set4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4-element sequence.
dst[31:0] := d dst[63:32] := c dst[95:64] := b dst[127:96] := a dst[159:128] := d dst[191:160] := c dst[223:192] := b dst[255:224] := a dst[287:256] := d dst[319:288] := c dst[351:320] := b dst[383:352] := a dst[415:384] := d dst[447:416] := c dst[479:448] := b dst[511:480] := a dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set4_ps'. Requires AVX512F.
func M512SetEpi32 ¶
func M512SetEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)
M512SetEpi32: Set packed 32-bit integers in 'dst' with the supplied values.
dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[287:256] := e8 dst[319:288] := e9 dst[351:320] := e10 dst[383:352] := e11 dst[415:384] := e12 dst[447:416] := e13 dst[479:448] := e14 dst[511:480] := e15 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set_epi32'. Requires AVX512F.
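Note the argument order: M512SetEpi32 stores its last argument 'e0' in the lowest element (bits 31:0) and 'e15' in the highest. The Setr ("reverse") variants further below store their first argument in the lowest element instead, so, were these stubs functional, M512SetEpi32(v15, ..., v0) and M512SetrEpi32(v0, ..., v15) would describe the same vector.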
func M512SetEpi64 ¶
func M512SetEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)
M512SetEpi64: Set packed 64-bit integers in 'dst' with the supplied values.
dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[319:256] := e4 dst[383:320] := e5 dst[447:384] := e6 dst[511:448] := e7 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set_epi64'. Requires AVX512F.
func M512SetPd ¶
func M512SetPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)
M512SetPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values.
dst[63:0] := e0 dst[127:64] := e1 dst[191:128] := e2 dst[255:192] := e3 dst[319:256] := e4 dst[383:320] := e5 dst[447:384] := e6 dst[511:448] := e7 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set_pd'. Requires AVX512F.
func M512SetPs ¶
func M512SetPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)
M512SetPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values.
dst[31:0] := e0 dst[63:32] := e1 dst[95:64] := e2 dst[127:96] := e3 dst[159:128] := e4 dst[191:160] := e5 dst[223:192] := e6 dst[255:224] := e7 dst[287:256] := e8 dst[319:288] := e9 dst[351:320] := e10 dst[383:352] := e11 dst[415:384] := e12 dst[447:416] := e13 dst[479:448] := e14 dst[511:480] := e15 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_set_ps'. Requires AVX512F.
func M512Setr4Epi32 ¶
M512Setr4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4-element sequence in reverse order.
dst[31:0] := a dst[63:32] := b dst[95:64] := c dst[127:96] := d dst[159:128] := a dst[191:160] := b dst[223:192] := c dst[255:224] := d dst[287:256] := a dst[319:288] := b dst[351:320] := c dst[383:352] := d dst[415:384] := a dst[447:416] := b dst[479:448] := c dst[511:480] := d dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr4_epi32'. Requires AVX512F.
func M512Setr4Epi64 ¶
M512Setr4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4-element sequence in reverse order.
dst[63:0] := a dst[127:64] := b dst[191:128] := c dst[255:192] := d dst[319:256] := a dst[383:320] := b dst[447:384] := c dst[511:448] := d dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr4_epi64'. Requires AVX512F.
func M512Setr4Pd ¶
M512Setr4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4-element sequence in reverse order.
dst[63:0] := a dst[127:64] := b dst[191:128] := c dst[255:192] := d dst[319:256] := a dst[383:320] := b dst[447:384] := c dst[511:448] := d dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr4_pd'. Requires AVX512F.
func M512Setr4Ps ¶
M512Setr4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4-element sequence in reverse order.
dst[31:0] := a dst[63:32] := b dst[95:64] := c dst[127:96] := d dst[159:128] := a dst[191:160] := b dst[223:192] := c dst[255:224] := d dst[287:256] := a dst[319:288] := b dst[351:320] := c dst[383:352] := d dst[415:384] := a dst[447:416] := b dst[479:448] := c dst[511:480] := d dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr4_ps'. Requires AVX512F.
func M512SetrEpi32 ¶
func M512SetrEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)
M512SetrEpi32: Set packed 32-bit integers in 'dst' with the supplied values in reverse order.
dst[31:0] := e15 dst[63:32] := e14 dst[95:64] := e13 dst[127:96] := e12 dst[159:128] := e11 dst[191:160] := e10 dst[223:192] := e9 dst[255:224] := e8 dst[287:256] := e7 dst[319:288] := e6 dst[351:320] := e5 dst[383:352] := e4 dst[415:384] := e3 dst[447:416] := e2 dst[479:448] := e1 dst[511:480] := e0 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr_epi32'. Requires AVX512F.
func M512SetrEpi64 ¶
func M512SetrEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)
M512SetrEpi64: Set packed 64-bit integers in 'dst' with the supplied values in reverse order.
dst[63:0] := e7 dst[127:64] := e6 dst[191:128] := e5 dst[255:192] := e4 dst[319:256] := e3 dst[383:320] := e2 dst[447:384] := e1 dst[511:448] := e0 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr_epi64'. Requires AVX512F.
func M512SetrPd ¶
func M512SetrPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)
M512SetrPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values in reverse order.
dst[63:0] := e7 dst[127:64] := e6 dst[191:128] := e5 dst[255:192] := e4 dst[319:256] := e3 dst[383:320] := e2 dst[447:384] := e1 dst[511:448] := e0 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr_pd'. Requires AVX512F.
func M512SetrPs ¶
func M512SetrPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)
M512SetrPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values in reverse order.
dst[31:0] := e15 dst[63:32] := e14 dst[95:64] := e13 dst[127:96] := e12 dst[159:128] := e11 dst[191:160] := e10 dst[223:192] := e9 dst[255:224] := e8 dst[287:256] := e7 dst[319:288] := e6 dst[351:320] := e5 dst[383:352] := e4 dst[415:384] := e3 dst[447:416] := e2 dst[479:448] := e1 dst[511:480] := e0 dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_setr_ps'. Requires AVX512F.
func M512Setzero ¶
M512Setzero: Return vector of type __m512 with all elements set to zero.
dst[MAX:0] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero'. Requires AVX512F.
func M512SetzeroEpi32 ¶
M512SetzeroEpi32: Return vector of type __m512i with all elements set to zero.
dst[MAX:0] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_epi32'. Requires AVX512F.
func M512SetzeroPd ¶
M512SetzeroPd: Return vector of type __m512d with all elements set to zero.
dst[MAX:0] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_pd'. Requires AVX512F.
func M512SetzeroPs ¶
M512SetzeroPs: Return vector of type __m512 with all elements set to zero.
dst[MAX:0] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_ps'. Requires AVX512F.
func M512SetzeroSi512 ¶
M512SetzeroSi512: Return vector of type __m512i with all elements set to zero.
dst[MAX:0] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_si512'. Requires AVX512F.
func M512ShuffleF32x4 ¶
M512ShuffleF32x4: Shuffle 128-bit lanes (each composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_shuffle_f32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
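At lane granularity the shuffle picks whole 128-bit blocks: the low two result lanes come from 'a' and the high two from 'b', each chosen by a 2-bit field of imm8. An illustrative Go sketch over float32 elements:

    // shuffleF32x4 models _mm512_shuffle_f32x4 on 16 float32 values
    // seen as four 128-bit lanes: result lanes 0-1 select from a,
    // lanes 2-3 from b, using consecutive 2-bit fields of imm8.
    func shuffleF32x4(a, b [16]float32, imm8 uint8) (dst [16]float32) {
        src := [2][16]float32{a, b}
        for lane := 0; lane < 4; lane++ {
            sel := int(imm8 >> (2 * lane) & 3) // 2-bit source-lane selector
            copy(dst[lane*4:lane*4+4], src[lane/2][sel*4:sel*4+4])
        }
        return
    }

The f64x2/i32x4/i64x2 variants below move the same 128-bit blocks; only the element type differs.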
func M512ShuffleF64x2 ¶
M512ShuffleF64x2: Shuffle 128-bit lanes (each composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_shuffle_f64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512ShuffleI32x4 ¶
M512ShuffleI32x4: Shuffle 128-bit lanes (each composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_shuffle_i32x4'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512ShuffleI64x2 ¶
M512ShuffleI64x2: Shuffle 128-bit lanes (each composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[127:0] := src[127:0] 1: tmp[127:0] := src[255:128] 2: tmp[127:0] := src[383:256] 3: tmp[127:0] := src[511:384] ESAC RETURN tmp[127:0] } dst[127:0] := SELECT4(a[511:0], imm8[1:0]) dst[255:128] := SELECT4(a[511:0], imm8[3:2]) dst[383:256] := SELECT4(b[511:0], imm8[5:4]) dst[511:384] := SELECT4(b[511:0], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_shuffle_i64x2'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512ShufflePd ¶
M512ShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] dst[MAX:512] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm512_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512ShufflePs ¶
M512ShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.
SELECT4(src, control){ CASE(control[1:0]) 0: tmp[31:0] := src[31:0] 1: tmp[31:0] := src[63:32] 2: tmp[31:0] := src[95:64] 3: tmp[31:0] := src[127:96] ESAC RETURN tmp[31:0] } dst[31:0] := SELECT4(a[127:0], imm8[1:0]) dst[63:32] := SELECT4(a[127:0], imm8[3:2]) dst[95:64] := SELECT4(b[127:0], imm8[5:4]) dst[127:96] := SELECT4(b[127:0], imm8[7:6]) dst[159:128] := SELECT4(a[255:128], imm8[1:0]) dst[191:160] := SELECT4(a[255:128], imm8[3:2]) dst[223:192] := SELECT4(b[255:128], imm8[5:4]) dst[255:224] := SELECT4(b[255:128], imm8[7:6]) dst[287:256] := SELECT4(a[383:256], imm8[1:0]) dst[319:288] := SELECT4(a[383:256], imm8[3:2]) dst[351:320] := SELECT4(b[383:256], imm8[5:4]) dst[383:352] := SELECT4(b[383:256], imm8[7:6]) dst[415:384] := SELECT4(a[511:384], imm8[1:0]) dst[447:416] := SELECT4(a[511:384], imm8[3:2]) dst[479:448] := SELECT4(b[511:384], imm8[5:4]) dst[511:480] := SELECT4(b[511:384], imm8[7:6]) dst[MAX:512] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm512_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
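The imm8 for these element shuffles packs four 2-bit selectors, lowest destination element first. A small helper (illustrative; it mirrors C's _MM_SHUFFLE macro, which this package does not necessarily provide) makes the packing explicit:

// shuffleControl builds the shuffle immediate in the style of C's
// _MM_SHUFFLE(z, y, x, w): w selects destination element 0 and z
// selects destination element 3 within each 128-bit lane.
func shuffleControl(z, y, x, w byte) byte {
	return (z&3)<<6 | (y&3)<<4 | (x&3)<<2 | w&3
}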
func M512SinPd ¶
M512SinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sin_pd'. Requires AVX512F.
func M512SinPs ¶
M512SinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sin_ps'. Requires AVX512F.
func M512SincosPd ¶
M512SincosPd: Compute the sine and cosine of the packed double-precision (64-bit) floating-point elements in 'a', store the results of the sine computation in 'dst', and store the results of the cosine computation in 'cos_res'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	cos_res[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sincos_pd'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func M512SincosPs ¶
M512SincosPs: Compute the sine and cosine of the packed single-precision (32-bit) floating-point elements in 'a', store the results of the sine computation in 'dst', and store the results of the cosine computation in 'cos_res'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	cos_res[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sincos_ps'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
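Both sincos entries follow the same element-wise recipe. As a reference point only (assuming nothing about how the pointer parameter will eventually be exposed), a scalar Go equivalent over eight float64 elements would be:

package sketch

import "math"

// sincosPd mirrors the _mm512_sincos_pd pseudocode element-wise:
// dst receives the sines and cosRes the cosines of the eight inputs.
func sincosPd(a [8]float64) (dst, cosRes [8]float64) {
	for j := range a {
		dst[j] = math.Sin(a[j])
		cosRes[j] = math.Cos(a[j])
	}
	return
}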
func M512SindPd ¶
M512SindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sind_pd'. Requires AVX512F.
func M512SindPs ¶
M512SindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sind_ps'. Requires AVX512F.
func M512SinhPd ¶
M512SinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sinh_pd'. Requires AVX512F.
func M512SinhPs ¶
M512SinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_sinh_ps'. Requires AVX512F.
func M512SllEpi32 ¶
M512SllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm512_sll_epi32'. Requires AVX512F.
func M512SllEpi64 ¶
M512SllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_sll_epi64'. Requires AVX512F.
func M512SlliEpi64 ¶
M512SlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm512_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
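Note that the shift count saturates instead of wrapping: anything above 63 clears the element. A minimal scalar sketch of the slli semantics (illustrative names, not this package's API):

// slliEpi64 models _mm512_slli_epi64: each 64-bit element is shifted
// left by imm8; the explicit branch mirrors the pseudocode's
// "count > 63 yields zero" rule.
func slliEpi64(a [8]uint64, imm8 uint8) (dst [8]uint64) {
	for j, x := range a {
		if imm8 > 63 {
			dst[j] = 0
		} else {
			dst[j] = x << imm8
		}
	}
	return
}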
func M512SllvEpi64 ¶
M512SllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_sllv_epi64'. Requires AVX512F.
func M512SqrtPd ¶
M512SqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_pd'. Requires AVX512F.
func M512SqrtPs ¶
M512SqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_ps'. Requires AVX512F.
func M512SqrtRoundPd ¶
M512SqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_round_pd'. Requires AVX512F.
func M512SqrtRoundPs ¶
M512SqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_round_ps'. Requires AVX512F.
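The rounding argument is a direction OR'd with an optional exception-suppression flag. For orientation, these are the values the C intrinsics headers use (reproduced here as an assumption about what a Go binding would mirror, not as this package's API):

// Rounding-control values as found in the C intrinsics headers.
const (
	_MM_FROUND_TO_NEAREST_INT = 0x00
	_MM_FROUND_TO_NEG_INF     = 0x01
	_MM_FROUND_TO_POS_INF     = 0x02
	_MM_FROUND_TO_ZERO        = 0x03
	_MM_FROUND_CUR_DIRECTION  = 0x04
	_MM_FROUND_NO_EXC         = 0x08
)

// Example: rounding := _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC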
func M512SraEpi32 ¶
M512SraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm512_sra_epi32'. Requires AVX512F.
func M512SraEpi64 ¶
M512SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_sra_epi64'. Requires AVX512F.
func M512SraiEpi64 ¶
M512SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm512_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
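Arithmetic right shifts fill vacated bits with the sign bit, so an oversized count leaves every bit equal to it. Go's signed shift already sign-extends, so a scalar model only needs to clamp the count (illustrative names):

// sraiEpi64 models _mm512_srai_epi64: counts above 63 are clamped to
// 63, which floods the element with copies of the sign bit exactly as
// the pseudocode's SignBit case does.
func sraiEpi64(a [8]int64, imm8 uint8) (dst [8]int64) {
	n := imm8
	if n > 63 {
		n = 63
	}
	for j, x := range a {
		dst[j] = x >> n
	}
	return
}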
func M512SravEpi64 ¶
M512SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_srav_epi64'. Requires AVX512F.
func M512SrlEpi32 ¶
M512SrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm512_srl_epi32'. Requires AVX512F.
func M512SrlEpi64 ¶
M512SrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srl_epi64'. Requires AVX512F.
func M512SrliEpi64 ¶
M512SrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512SrlvEpi64 ¶
M512SrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_srlv_epi64'. Requires AVX512F.
func M512SubEpi64 ¶
M512SubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm512_sub_epi64'. Requires AVX512F.
func M512SvmlRoundPd ¶
M512SvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_svml_round_pd'. Requires AVX512F.
func M512TanPd ¶
M512TanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tan_pd'. Requires AVX512F.
func M512TanPs ¶
M512TanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tan_ps'. Requires AVX512F.
func M512TandPd ¶
M512TandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tand_pd'. Requires AVX512F.
func M512TandPs ¶
M512TandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tand_ps'. Requires AVX512F.
func M512TanhPd ¶
M512TanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tanh_pd'. Requires AVX512F.
func M512TanhPs ¶
M512TanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_tanh_ps'. Requires AVX512F.
func M512TernarylogicEpi32 ¶
M512TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst'.
FOR j := 0 to 15
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func M512TernarylogicEpi64 ¶
M512TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst'.
FOR j := 0 to 7
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
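The imm8 is simply the truth table of the desired three-input function: bit (a<<2 | b<<1 | c) of imm8 holds the output for that input combination. A hedged helper (not part of this package) derives the immediate from any Go predicate; for example, (a AND b) OR c yields 0xEA:

// ternlogImm computes the VPTERNLOG immediate for f: bit
// (a<<2 | b<<1 | c) of the result is f(a, b, c).
func ternlogImm(f func(a, b, c bool) bool) (imm8 byte) {
	for idx := 0; idx < 8; idx++ {
		if f(idx&4 != 0, idx&2 != 0, idx&1 != 0) {
			imm8 |= 1 << idx
		}
	}
	return
}

// ternlogImm(func(a, b, c bool) bool { return (a && b) || c }) == 0xEA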
func M512TestEpi64Mask ¶
M512TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_test_epi64_mask'. Requires AVX512F.
func M512TestnEpi32Mask ¶
M512TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 15
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_testn_epi32_mask'. Requires AVX512F.
func M512TestnEpi64Mask ¶
M512TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_testn_epi64_mask'. Requires AVX512F.
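test and testn are complements: test sets a mask bit when the AND is non-zero, testn when it is zero. A scalar sketch of both (illustrative names):

// testEpi64Mask models VPTESTMQ: bit j of k is set when
// a[j] AND b[j] is non-zero.
func testEpi64Mask(a, b [8]uint64) (k uint8) {
	for j := range a {
		if a[j]&b[j] != 0 {
			k |= 1 << j
		}
	}
	return
}

// testnEpi64Mask models VPTESTNMQ: bit j of k is set when
// a[j] AND b[j] is zero.
func testnEpi64Mask(a, b [8]uint64) (k uint8) {
	for j := range a {
		if a[j]&b[j] == 0 {
			k |= 1 << j
		}
	}
	return
}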
func M512TruncPd ¶
M512TruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst'.
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_trunc_pd'. Requires AVX512F.
func M512TruncPs ¶
M512TruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst'.
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Instruction: '...'. Intrinsic: '_mm512_trunc_ps'. Requires AVX512F.
func M512Undefined ¶
M512Undefined: Return vector of type __m512 with undefined elements.
Instruction: ''. Intrinsic: '_mm512_undefined'. Requires AVX512F.
func M512UndefinedEpi32 ¶
M512UndefinedEpi32: Return vector of type __m512i with undefined elements.
Instruction: ''. Intrinsic: '_mm512_undefined_epi32'. Requires AVX512F.
func M512UndefinedPd ¶
M512UndefinedPd: Return vector of type __m512d with undefined elements.
Instruction: ''. Intrinsic: '_mm512_undefined_pd'. Requires AVX512F.
func M512UndefinedPs ¶
M512UndefinedPs: Return vector of type __m512 with undefined elements.
Instruction: ''. Intrinsic: '_mm512_undefined_ps'. Requires AVX512F.
func M512UnpackhiEpi32 ¶
M512UnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_unpackhi_epi32'. Requires AVX512F.
func M512UnpackhiEpi64 ¶
M512UnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_unpackhi_epi64'. Requires AVX512F.
func M512UnpackhiPd ¶
M512UnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_unpackhi_pd'. Requires AVX512F.
func M512UnpackhiPs ¶
M512UnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_unpackhi_ps'. Requires AVX512F.
func M512UnpackloEpi32 ¶
M512UnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_unpacklo_epi32'. Requires AVX512F.
func M512UnpackloEpi64 ¶
M512UnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_unpacklo_epi64'. Requires AVX512F.
func M512UnpackloPd ¶
M512UnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_unpacklo_pd'. Requires AVX512F.
func M512UnpackloPs ¶
M512UnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_unpacklo_ps'. Requires AVX512F.
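All eight unpack variants above interleave within independent 128-bit lanes; only the element width and the half (low or high) differ. A scalar model of the 32-bit low-half case, with the 512-bit vector as sixteen uint32s, four per lane (illustrative names):

// unpackloEpi32 models VPUNPCKLDQ: within each 128-bit lane the two
// low elements of a and b interleave as a0, b0, a1, b1.
func unpackloEpi32(a, b [16]uint32) (dst [16]uint32) {
	for lane := 0; lane < 4; lane++ {
		o := lane * 4
		dst[o], dst[o+1] = a[o], b[o]
		dst[o+2], dst[o+3] = a[o+1], b[o+1]
	}
	return
}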
func Mask2Permutex2varEpi32 ¶
Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMI2D'. Intrinsic: '_mm_mask2_permutex2var_epi32'. Requires AVX512F.
func Mask2Permutex2varEpi64 ¶
Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMI2Q'. Intrinsic: '_mm_mask2_permutex2var_epi64'. Requires AVX512F.
func Mask2Permutex2varPd ¶
Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMI2PD'. Intrinsic: '_mm_mask2_permutex2var_pd'. Requires AVX512F.
func Mask2Permutex2varPs ¶
Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMI2PS'. Intrinsic: '_mm_mask2_permutex2var_ps'. Requires AVX512F.
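In the two-source permutes each idx element carries both the source selector and the element index; in the mask2 variants a cleared mask bit writes the idx element itself through. A scalar sketch of the 128-bit epi32 case (illustrative names):

// mask2Permutex2varEpi32 models VPERMI2D on 128-bit vectors: idx bit 2
// selects a or b, idx bits 1:0 select the element, and destinations
// whose k bit is clear are copied from idx.
func mask2Permutex2varEpi32(a, idx [4]uint32, k uint8, b [4]uint32) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) == 0 {
			dst[j] = idx[j]
			continue
		}
		off := idx[j] & 3
		if idx[j]&4 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}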
func Mask3FmaddPd ¶
Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask3_fmadd_pd'. Requires AVX512F.
func Mask3FmaddPs ¶
Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask3_fmadd_ps'. Requires AVX512F.
func Mask3FmaddRoundSd ¶
func Mask3FmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
Mask3FmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_round_sd'. Requires AVX512F.
func Mask3FmaddRoundSs ¶
func Mask3FmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
Mask3FmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_round_ss'. Requires AVX512F.
func Mask3FmaddSd ¶
Mask3FmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_sd'. Requires AVX512F.
func Mask3FmaddSs ¶
Mask3FmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_ss'. Requires AVX512F.
func Mask3FmaddsubPd ¶
Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask3_fmaddsub_pd'. Requires AVX512F.
func Mask3FmaddsubPs ¶
Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask3_fmaddsub_ps'. Requires AVX512F.
func Mask3FmsubPd ¶
Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask3_fmsub_pd'. Requires AVX512F.
func Mask3FmsubPs ¶
Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask3_fmsub_ps'. Requires AVX512F.
func Mask3FmsubRoundSd ¶
func Mask3FmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
Mask3FmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_round_sd'. Requires AVX512F.
func Mask3FmsubRoundSs ¶
func Mask3FmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
Mask3FmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_round_ss'. Requires AVX512F.
func Mask3FmsubSd ¶
Mask3FmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_sd'. Requires AVX512F.
func Mask3FmsubSs ¶
Mask3FmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_ss'. Requires AVX512F.
func Mask3FmsubaddPd ¶
Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask3_fmsubadd_pd'. Requires AVX512F.
func Mask3FmsubaddPs ¶
Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask3_fmsubadd_ps'. Requires AVX512F.
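fmaddsub subtracts 'c' in even lanes and adds it in odd lanes; fmsubadd is the mirror image. The even/odd pattern is the only difference between the two families, so here is a scalar sketch of both unmasked cores (note that Go's separate multiply and add round twice, unlike a fused FMA, so this illustrates only the lane pattern; masked variants would copy c[j] where bit j of k is clear):

// fmaddsub: a*b-c in even lanes, a*b+c in odd lanes.
func fmaddsub(a, b, c [4]float32) (dst [4]float32) {
	for j := range a {
		if j%2 == 0 {
			dst[j] = a[j]*b[j] - c[j]
		} else {
			dst[j] = a[j]*b[j] + c[j]
		}
	}
	return
}

// fmsubadd: a*b+c in even lanes, a*b-c in odd lanes.
func fmsubadd(a, b, c [4]float32) (dst [4]float32) {
	for j := range a {
		if j%2 == 0 {
			dst[j] = a[j]*b[j] + c[j]
		} else {
			dst[j] = a[j]*b[j] - c[j]
		}
	}
	return
}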
func Mask3FnmaddPd ¶
Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask3_fnmadd_pd'. Requires AVX512F.
func Mask3FnmaddPs ¶
Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask3_fnmadd_ps'. Requires AVX512F.
func Mask3FnmaddRoundSd ¶
func Mask3FnmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
Mask3FnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_round_sd'. Requires AVX512F.
func Mask3FnmaddRoundSs ¶
func Mask3FnmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
Mask3FnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_round_ss'. Requires AVX512F.
func Mask3FnmaddSd ¶
Mask3FnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_sd'. Requires AVX512F.
func Mask3FnmaddSs ¶
Mask3FnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_ss'. Requires AVX512F.
func Mask3FnmsubPd ¶
Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask3_fnmsub_pd'. Requires AVX512F.
func Mask3FnmsubPs ¶
Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask3_fnmsub_ps'. Requires AVX512F.
func Mask3FnmsubRoundSd ¶
func Mask3FnmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)
Mask3FnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_round_sd'. Requires AVX512F.
func Mask3FnmsubRoundSs ¶
func Mask3FnmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)
Mask3FnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_round_ss'. Requires AVX512F.
func Mask3FnmsubSd ¶
Mask3FnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_sd'. Requires AVX512F.
func Mask3FnmsubSs ¶
Mask3FnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_ss'. Requires AVX512F.
func MaskAbsEpi32 ¶
MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm_mask_abs_epi32'. Requires AVX512F.
func MaskAbsEpi64 ¶
MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm_mask_abs_epi64'. Requires AVX512F.
func MaskAddEpi32 ¶
MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPADDD'. Intrinsic: '_mm_mask_add_epi32'. Requires AVX512F.
func MaskAddEpi64 ¶
MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm_mask_add_epi64'. Requires AVX512F.
func MaskAddRoundSd ¶
func MaskAddRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_round_sd'. Requires AVX512F.
func MaskAddRoundSs ¶
func MaskAddRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_round_ss'. Requires AVX512F.
func MaskAddSd ¶
MaskAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_sd'. Requires AVX512F.
func MaskAddSs ¶
MaskAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_ss'. Requires AVX512F.
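The masked scalar operations above all share one skeleton: compute into the low element when mask bit 0 is set, otherwise keep the low element of 'src', and always pass the upper elements of 'a' through. Using the add_ss entry as the example (illustrative names and plain [4]float32 vectors):

// maskAddSs models _mm_mask_add_ss: dst[0] is a[0]+b[0] if bit 0 of k
// is set, else src[0]; elements 1-3 always come from a.
func maskAddSs(src [4]float32, k uint8, a, b [4]float32) (dst [4]float32) {
	dst = a // upper three elements copied from a
	if k&1 != 0 {
		dst[0] = a[0] + b[0]
	} else {
		dst[0] = src[0]
	}
	return
}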
func MaskAndEpi32 ¶
MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPANDD'. Intrinsic: '_mm_mask_and_epi32'. Requires AVX512F.
func MaskAndEpi64 ¶
MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPANDQ'. Intrinsic: '_mm_mask_and_epi64'. Requires AVX512F.
func MaskAndnotEpi32 ¶
MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPANDND'. Intrinsic: '_mm_mask_andnot_epi32'. Requires AVX512F.
func MaskAndnotEpi64 ¶
MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPANDNQ'. Intrinsic: '_mm_mask_andnot_epi64'. Requires AVX512F.
func MaskBlendEpi32 ¶
MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPBLENDMD'. Intrinsic: '_mm_mask_blend_epi32'. Requires AVX512F.
func MaskBlendEpi64 ¶
MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPBLENDMQ'. Intrinsic: '_mm_mask_blend_epi64'. Requires AVX512F.
func MaskBlendPd ¶
MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VBLENDMPD'. Intrinsic: '_mm_mask_blend_pd'. Requires AVX512F.
func MaskBlendPs ¶
MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VBLENDMPS'. Intrinsic: '_mm_mask_blend_ps'. Requires AVX512F.
func MaskBroadcastdEpi32 ¶
MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_broadcastd_epi32'. Requires AVX512F.
func MaskBroadcastqEpi64 ¶
MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_broadcastq_epi64'. Requires AVX512F.
func MaskBroadcastssPs ¶
MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_mask_broadcastss_ps'. Requires AVX512F.
func MaskCmpEpi32Mask ¶
MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmp_epi32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpEpi64Mask ¶
MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmp_epi64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpEpu32Mask ¶
MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmp_epu32_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpEpu64Mask ¶
MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmp_epu64_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpPdMask ¶
MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VCMPPD'. Intrinsic: '_mm_mask_cmp_pd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpPsMask ¶
MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VCMPPS'. Intrinsic: '_mm_mask_cmp_ps_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpRoundSdMask ¶
func MaskCmpRoundSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)
MaskCmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_round_sd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpRoundSsMask ¶
MaskCmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_round_ss_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpSdMask ¶
MaskCmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_sd_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpSsMask ¶
MaskCmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).
CASE (imm8[7:0]) OF 0: OP := _CMP_EQ_OQ 1: OP := _CMP_LT_OS 2: OP := _CMP_LE_OS 3: OP := _CMP_UNORD_Q 4: OP := _CMP_NEQ_UQ 5: OP := _CMP_NLT_US 6: OP := _CMP_NLE_US 7: OP := _CMP_ORD_Q 8: OP := _CMP_EQ_UQ 9: OP := _CMP_NGE_US 10: OP := _CMP_NGT_US 11: OP := _CMP_FALSE_OQ 12: OP := _CMP_NEQ_OQ 13: OP := _CMP_GE_OS 14: OP := _CMP_GT_OS 15: OP := _CMP_TRUE_UQ 16: OP := _CMP_EQ_OS 17: OP := _CMP_LT_OQ 18: OP := _CMP_LE_OQ 19: OP := _CMP_UNORD_S 20: OP := _CMP_NEQ_US 21: OP := _CMP_NLT_UQ 22: OP := _CMP_NLE_UQ 23: OP := _CMP_ORD_S 24: OP := _CMP_EQ_US 25: OP := _CMP_NGE_UQ 26: OP := _CMP_NGT_UQ 27: OP := _CMP_FALSE_OS 28: OP := _CMP_NEQ_OS 29: OP := _CMP_GE_OQ 30: OP := _CMP_GT_OQ 31: OP := _CMP_TRUE_US ESAC IF k1[0] k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 ELSE k[0] := 0 FI k[MAX:1] := 0
Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_ss_mask'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskCmpeqEpi32Mask ¶
MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpeq_epi32_mask'. Requires AVX512F.
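Because the incoming mask zeroes lanes rather than skipping them, these compares chain naturally: feeding the mask produced by one compare in as the zeromask of the next ANDs the two predicates. A sketch (illustrative only):

    var a, b, c x86.M128i
    eq := MaskCmpeqEpi32Mask(x86.Mmask8(0xF), a, b) // a[j] == b[j]
    both := MaskCmpgtEpi32Mask(eq, a, c)            // ...AND a[j] > c[j]
    _ = both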
func MaskCmpeqEpi64Mask ¶
MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpeq_epi64_mask'. Requires AVX512F.
func MaskCmpeqEpu32Mask ¶
MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpeq_epu32_mask'. Requires AVX512F.
func MaskCmpeqEpu64Mask ¶
MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpeq_epu64_mask'. Requires AVX512F.
func MaskCmpgeEpi32Mask ¶
MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpge_epi32_mask'. Requires AVX512F.
func MaskCmpgeEpi64Mask ¶
MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpge_epi64_mask'. Requires AVX512F.
func MaskCmpgeEpu32Mask ¶
MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpge_epu32_mask'. Requires AVX512F.
func MaskCmpgeEpu64Mask ¶
MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpge_epu64_mask'. Requires AVX512F.
func MaskCmpgtEpi32Mask ¶
MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpgt_epi32_mask'. Requires AVX512F.
func MaskCmpgtEpi64Mask ¶
MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpgt_epi64_mask'. Requires AVX512F.
func MaskCmpgtEpu32Mask ¶
MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpgt_epu32_mask'. Requires AVX512F.
func MaskCmpgtEpu64Mask ¶
MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpgt_epu64_mask'. Requires AVX512F.
func MaskCmpleEpi32Mask ¶
MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmple_epi32_mask'. Requires AVX512F.
func MaskCmpleEpi64Mask ¶
MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmple_epi64_mask'. Requires AVX512F.
func MaskCmpleEpu32Mask ¶
MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmple_epu32_mask'. Requires AVX512F.
func MaskCmpleEpu64Mask ¶
MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmple_epu64_mask'. Requires AVX512F.
func MaskCmpltEpi32Mask ¶
MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmplt_epi32_mask'. Requires AVX512F.
func MaskCmpltEpi64Mask ¶
MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmplt_epi64_mask'. Requires AVX512F.
func MaskCmpltEpu32Mask ¶
MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmplt_epu32_mask'. Requires AVX512F.
func MaskCmpltEpu64Mask ¶
MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmplt_epu64_mask'. Requires AVX512F.
func MaskCmpneqEpi32Mask ¶
MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpneq_epi32_mask'. Requires AVX512F.
func MaskCmpneqEpi64Mask ¶
MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpneq_epi64_mask'. Requires AVX512F.
func MaskCmpneqEpu32Mask ¶
MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpneq_epu32_mask'. Requires AVX512F.
func MaskCmpneqEpu64Mask ¶
MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpneq_epu64_mask'. Requires AVX512F.
func MaskCompressEpi32 ¶
MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_mask_compress_epi32'. Requires AVX512F.
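Compress is the left-packing primitive: the selected lanes are packed toward element 0 in order, and the tail is filled from 'src'. A sketch (illustrative only):

    var src, a x86.M128i
    k := x86.Mmask8(0b1010)             // keep lanes 1 and 3
    dst := MaskCompressEpi32(src, k, a) // a[1], a[3] in lanes 0-1; lanes 2-3 from src
    _ = dst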
func MaskCompressEpi64 ¶
MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_mask_compress_epi64'. Requires AVX512F.
func MaskCompressPd ¶
MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_mask_compress_pd'. Requires AVX512F.
func MaskCompressPs ¶
MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.
size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := src[127:m] dst[MAX:128] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_mask_compress_ps'. Requires AVX512F.
func MaskCvtRoundpsPh ¶
MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvt_roundps_ph'. Requires AVX512F.
func MaskCvtRoundsdSs ¶
func MaskCvtRoundsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)
MaskCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvt_roundsd_ss'. Requires AVX512F.
func MaskCvtRoundssSd ¶
func MaskCvtRoundssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)
MaskCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvt_roundss_sd'. Requires AVX512F.
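For the narrowing conversion above (MaskCvtRoundsdSs) the rounding mode is meaningful; the widening conversion here is exact, so usually only the exception behavior is interesting. A sketch passing the raw encoding for "truncate and suppress exceptions" (assuming the usual C values _MM_FROUND_TO_ZERO = 3 and _MM_FROUND_NO_EXC = 8, since the Go-side constant names are not shown in this listing):

    var src, a x86.M128
    var b x86.M128d
    k := x86.Mmask8(1)
    dst := MaskCvtRoundsdSs(src, k, a, b, 3|8) // round toward zero, no exceptions
    _ = dst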
func MaskCvtepi16Epi32 ¶
MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 l := j*16 IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_mask_cvtepi16_epi32'. Requires AVX512F.
func MaskCvtepi16Epi64 ¶
MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_mask_cvtepi16_epi64'. Requires AVX512F.
func MaskCvtepi32Epi16 ¶
MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm_mask_cvtepi32_epi16'. Requires AVX512F.
func MaskCvtepi32Epi64 ¶
MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_mask_cvtepi32_epi64'. Requires AVX512F.
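A widening convert reads fewer source lanes than it writes, so only the low half of 'a' participates here. A sketch (illustrative only):

    var src, a x86.M128i
    k := x86.Mmask8(0b01)               // widen a[31:0] into dst lane 0 only
    dst := MaskCvtepi32Epi64(src, k, a) // dst lane 1 is copied from src
    _ = dst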
func MaskCvtepi32Epi8 ¶
MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm_mask_cvtepi32_epi8'. Requires AVX512F.
func MaskCvtepi32Pd ¶
MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := src[m+63:m] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_mask_cvtepi32_pd'. Requires AVX512F.
func MaskCvtepi32Ps ¶
MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_mask_cvtepi32_ps'. Requires AVX512F.
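With a full mask this behaves like a plain int-to-float conversion; note that float32 has a 24-bit significand, so 32-bit integers of magnitude 2^24 or more may round. A sketch (illustrative only):

    var src x86.M128
    var a x86.M128i
    dst := MaskCvtepi32Ps(src, x86.Mmask8(0xF), a) // all four lanes converted
    _ = dst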
func MaskCvtepi64Epi16 ¶
MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm_mask_cvtepi64_epi16'. Requires AVX512F.
func MaskCvtepi64Epi32 ¶
MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm_mask_cvtepi64_epi32'. Requires AVX512F.
func MaskCvtepi64Epi8 ¶
MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm_mask_cvtepi64_epi8'. Requires AVX512F.
func MaskCvtepi8Epi32 ¶
MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_mask_cvtepi8_epi32'. Requires AVX512F.
func MaskCvtepi8Epi64 ¶
MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_mask_cvtepi8_epi64'. Requires AVX512F.
func MaskCvtepu16Epi32 ¶
MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_mask_cvtepu16_epi32'. Requires AVX512F.
func MaskCvtepu16Epi64 ¶
MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_mask_cvtepu16_epi64'. Requires AVX512F.
func MaskCvtepu32Epi64 ¶
MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_mask_cvtepu32_epi64'. Requires AVX512F.
func MaskCvtepu32Pd ¶
MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_mask_cvtepu32_pd'. Requires AVX512F.
func MaskCvtepu8Epi32 ¶
MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_mask_cvtepu8_epi32'. Requires AVX512F.
func MaskCvtepu8Epi64 ¶
MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_mask_cvtepu8_epi64'. Requires AVX512F.
func MaskCvtpdEpi32 ¶
MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_mask_cvtpd_epi32'. Requires AVX512F.
func MaskCvtpdEpu32 ¶
MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_mask_cvtpd_epu32'. Requires AVX512F.
func MaskCvtpdPs ¶
MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_mask_cvtpd_ps'. Requires AVX512F.
func MaskCvtphPs ¶
MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_mask_cvtph_ps'. Requires AVX512F.
func MaskCvtpsEpi32 ¶
MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_mask_cvtps_epi32'. Requires AVX512F.
func MaskCvtpsEpu32 ¶
MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_mask_cvtps_epu32'. Requires AVX512F.
func MaskCvtpsPh ¶
MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvtps_ph'. Requires AVX512F.
func MaskCvtsdSs ¶
MaskCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvtsd_ss'. Requires AVX512F.
func MaskCvtsepi32Epi16 ¶
MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm_mask_cvtsepi32_epi16'. Requires AVX512F.
func MaskCvtsepi32Epi8 ¶
MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm_mask_cvtsepi32_epi8'. Requires AVX512F.
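Unlike the plain truncating narrow (MaskCvtepi32Epi8 above), the saturating form clamps out-of-range inputs to the int8 limits rather than discarding high bits, e.g. 300 becomes 127 and -300 becomes -128. A sketch (illustrative only):

    var src, a x86.M128i
    dst := MaskCvtsepi32Epi8(src, x86.Mmask8(0xF), a) // four saturated bytes in dst[31:0]
    _ = dst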
func MaskCvtsepi64Epi16 ¶
MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm_mask_cvtsepi64_epi16'. Requires AVX512F.
func MaskCvtsepi64Epi32 ¶
MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm_mask_cvtsepi64_epi32'. Requires AVX512F.
func MaskCvtsepi64Epi8 ¶
MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm_mask_cvtsepi64_epi8'. Requires AVX512F.
func MaskCvtssSd ¶
MaskCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvtss_sd'. Requires AVX512F.
func MaskCvttpdEpi32 ¶
MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_mask_cvttpd_epi32'. Requires AVX512F.
func MaskCvttpdEpu32 ¶
MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_mask_cvttpd_epu32'. Requires AVX512F.
func MaskCvttpsEpi32 ¶
MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_mask_cvttps_epi32'. Requires AVX512F.
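The extra 't' in the mnemonic means truncate: where MaskCvtpsEpi32 rounds according to MXCSR.RC, this form always chops toward zero, matching a C float-to-int cast. A sketch (illustrative only):

    var src x86.M128i
    var a x86.M128
    k := x86.Mmask8(0xF)
    rounded := MaskCvtpsEpi32(src, k, a)  // honors the current rounding mode
    chopped := MaskCvttpsEpi32(src, k, a) // always truncates toward zero
    _, _ = rounded, chopped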
func MaskCvttpsEpu32 ¶
MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_mask_cvttps_epu32'. Requires AVX512F.
func MaskCvtusepi32Epi16 ¶
MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_mask_cvtusepi32_epi16'. Requires AVX512F.
func MaskCvtusepi32Epi8 ¶
MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_mask_cvtusepi32_epi8'. Requires AVX512F.
func MaskCvtusepi64Epi16 ¶
MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_mask_cvtusepi64_epi16'. Requires AVX512F.
func MaskCvtusepi64Epi32 ¶
MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := src[l+31:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_mask_cvtusepi64_epi32'. Requires AVX512F.
func MaskCvtusepi64Epi8 ¶
MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_mask_cvtusepi64_epi8'. Requires AVX512F.
func MaskDivPd ¶
MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm_mask_div_pd'. Requires AVX512F.
func MaskDivPs ¶
MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm_mask_div_ps'. Requires AVX512F.
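The writemask also suppresses floating-point exceptions from masked-off lanes, so the mask can guard lanes whose divisor is known to be zero. A sketch (illustrative only):

    var src, a, b x86.M128
    k := x86.Mmask8(0b0111)        // suppose lane 3 of b may be zero
    dst := MaskDivPs(src, k, a, b) // lane 3 neither divides nor faults; it comes from src
    _ = dst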
func MaskDivRoundSd ¶
func MaskDivRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_round_sd'. Requires AVX512F.
func MaskDivRoundSs ¶
func MaskDivRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_round_ss'. Requires AVX512F.
func MaskDivSd ¶
MaskDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_sd'. Requires AVX512F.
func MaskDivSs ¶
MaskDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_ss'. Requires AVX512F.
func MaskExpandEpi32 ¶
MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm_mask_expand_epi32'. Requires AVX512F.
func MaskExpandEpi64 ¶
MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_mask_expand_epi64'. Requires AVX512F.
func MaskExpandPd ¶
MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm_mask_expand_pd'. Requires AVX512F.
func MaskExpandPs ¶
MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm_mask_expand_ps'. Requires AVX512F.
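Expand is the inverse of the compress operations above: it reads packed values from the low lanes of 'a' and scatters them out to the selected lanes. Compressing and then expanding with the same mask therefore restores the selected lanes in place. A sketch (illustrative only):

    var src, a x86.M128
    k := x86.Mmask8(0b1010)
    packed := MaskCompressPs(src, k, a)    // a[1], a[3] packed into lanes 0-1
    spread := MaskExpandPs(src, k, packed) // back in lanes 1 and 3
    _ = spread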
func MaskFixupimmPd ¶
MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_mask_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskFixupimmPs ¶
MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){ tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] CASE(tsrc[31:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[31:0] := src1[31:0] 1 : dest[31:0] := tsrc[31:0] 2 : dest[31:0] := QNaN(tsrc[31:0]) 3 : dest[31:0] := QNAN_Indefinite 4 : dest[31:0] := -INF 5 : dest[31:0] := +INF 6 : dest[31:0] := tsrc.sign? -INF : +INF 7 : dest[31:0] := -0 8 : dest[31:0] := +0 9 : dest[31:0] := -1 10: dest[31:0] := +1 11: dest[31:0] := 1/2 12: dest[31:0] := 90.0 13: dest[31:0] := PI/2 14: dest[31:0] := MAX_FLOAT 15: dest[31:0] := -MAX_FLOAT ESAC CASE(tsrc[31:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[31:0] } FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_mask_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskFixupimmRoundSd ¶
func MaskFixupimmRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
MaskFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
enum TOKEN_TYPE { QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3, NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7 } FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){ tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] CASE(tsrc[63:0] of TOKEN_TYPE) QNAN_TOKEN:j := 0 SNAN_TOKEN:j := 1 ZERO_VALUE_TOKEN: j := 2 ONE_VALUE_TOKEN: j := 3 NEG_INF_TOKEN: j := 4 POS_INF_TOKEN: j := 5 NEG_VALUE_TOKEN: j := 6 POS_VALUE_TOKEN: j := 7 ESAC token_response[3:0] := src3[3+4*j:4*j] CASE(token_response[3:0]) of 0 : dest[63:0] := src1[63:0] 1 : dest[63:0] := tsrc[63:0] 2 : dest[63:0] := QNaN(tsrc[63:0]) 3 : dest[63:0] := QNAN_Indefinite 4 : dest[63:0] := -INF 5 : dest[63:0] := +INF 6 : dest[63:0] := tsrc.sign? -INF : +INF 7 : dest[63:0] := -0 8 : dest[63:0] := +0 9 : dest[63:0] := -1 10: dest[63:0] := +1 11: dest[63:0] := 1/2 12: dest[63:0] := 90.0 13: dest[63:0] := PI/2 14: dest[63:0] := MAX_FLOAT 15: dest[63:0] := -MAX_FLOAT ESAC CASE(tsrc[63:0] of TOKEN_TYPE) ZERO_VALUE_TOKEN: if imm8[0] then set #ZE ZERO_VALUE_TOKEN: if imm8[1] then set #IE ONE_VALUE_TOKEN: if imm8[2] then set #ZE ONE_VALUE_TOKEN: if imm8[3] then set #IE SNAN_TOKEN: if imm8[4] then set #IE NEG_INF_TOKEN: if imm8[5] then set #IE NEG_VALUE_TOKEN: if imm8[6] then set #IE POS_INF_TOKEN: if imm8[7] then set #IE ESAC RETURN dest[63:0] } IF k[0] dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskFixupimmRoundSs ¶
func MaskFixupimmRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)
MaskFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
IF k[0]
    dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskFixupimmSd ¶
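func MaskFixupimmSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)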
MaskFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign ? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
IF k[0]
    dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
    dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskFixupimmSs ¶
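func MaskFixupimmSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)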
MaskFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
IF k[0]
    dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
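The FIXUPIMM family reads most easily as a two-step table lookup: classify the (DAZ-adjusted) input into one of the eight tokens, then use that token to pick a 4-bit response from 'c' that selects the output. The following plain-Go sketch illustrates only the classification and response lookup for the 32-bit case; the names are illustrative (not part of any package), the 16 response actions and flag setting are omitted, and telling QNaN from SNaN would require inspecting the raw bits. These packages do not implement the real intrinsic.

    import "math"

    const (
        qnanToken = iota // token values 0..7, as in the pseudocode's TOKEN_TYPE
        snanToken
        zeroValueToken
        oneValueToken
        negInfToken
        posInfToken
        negValueToken
        posValueToken
    )

    // classify maps a float32 lane to its TOKEN_TYPE index j.
    func classify(x float32) int {
        f := float64(x)
        switch {
        case math.IsNaN(f):
            return qnanToken // SNaN detection needs math.Float32bits(x)
        case f == 0:
            return zeroValueToken
        case f == 1:
            return oneValueToken
        case math.IsInf(f, -1):
            return negInfToken
        case math.IsInf(f, 1):
            return posInfToken
        case f < 0:
            return negValueToken
        default:
            return posValueToken
        }
    }

    // tokenResponse extracts src3[3+4*j:4*j], the 4-bit response that
    // selects which of the 16 fixup results to produce.
    func tokenResponse(src3 uint32, j int) uint32 {
        return (src3 >> (4 * uint(j))) & 0xF
    }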
func MaskFmaddPd ¶
MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask_fmadd_pd'. Requires AVX512F.
func MaskFmaddPs ¶
MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask_fmadd_ps'. Requires AVX512F.
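All of the masked FMA variants above share the same per-lane pattern: compute the fused operation where the mask bit is set, otherwise pass the corresponding lane of 'a' through. A rough Go model of MaskFmaddPs, treating the vector as [4]float32 and the low four bits of 'k' as the writemask (illustrative only; a plain Go a*b+c is not guaranteed to round like a single fused multiply-add):

    func maskFmaddPs(a [4]float32, k uint8, b, c [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j]*b[j] + c[j] // active lane: multiply-add
            } else {
                dst[j] = a[j] // masked-off lane: copy from 'a'
            }
        }
        return dst
    }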
func MaskFmaddRoundSd ¶
func MaskFmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
    dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_round_sd'. Requires AVX512F.
func MaskFmaddRoundSs ¶
func MaskFmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_round_ss'. Requires AVX512F.
func MaskFmaddSd ¶
MaskFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_sd'. Requires AVX512F.
func MaskFmaddSs ¶
MaskFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_ss'. Requires AVX512F.
func MaskFmaddsubPd ¶
MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF (j is even)
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        ELSE
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        FI
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask_fmaddsub_pd'. Requires AVX512F.
func MaskFmaddsubPs ¶
MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF (j is even)
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        ELSE
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        FI
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask_fmaddsub_ps'. Requires AVX512F.
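The 'addsub' forms differ from plain FMA only in the even/odd lane split: even-indexed lanes subtract 'c', odd-indexed lanes add it (the 'subadd' forms below swap the two). A sketch under the same [4]float32 model as above:

    func maskFmaddsubPs(a [4]float32, k uint8, b, c [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            switch {
            case k&(1<<uint(j)) == 0:
                dst[j] = a[j] // masked-off lane: copy from 'a'
            case j%2 == 0:
                dst[j] = a[j]*b[j] - c[j] // even lane: subtract
            default:
                dst[j] = a[j]*b[j] + c[j] // odd lane: add
            }
        }
        return dst
    }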
func MaskFmsubPd ¶
MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask_fmsub_pd'. Requires AVX512F.
func MaskFmsubPs ¶
MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask_fmsub_ps'. Requires AVX512F.
func MaskFmsubRoundSd ¶
func MaskFmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
    dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_round_sd'. Requires AVX512F.
func MaskFmsubRoundSs ¶
func MaskFmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_round_ss'. Requires AVX512F.
func MaskFmsubSd ¶
MaskFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_sd'. Requires AVX512F.
func MaskFmsubSs ¶
MaskFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_ss'. Requires AVX512F.
func MaskFmsubaddPd ¶
MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF (j is even)
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
        ELSE
            dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
        FI
    ELSE
        dst[i+63:i] := a[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask_fmsubadd_pd'. Requires AVX512F.
func MaskFmsubaddPs ¶
MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF (j is even)
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
        ELSE
            dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
        FI
    ELSE
        dst[i+31:i] := a[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask_fmsubadd_ps'. Requires AVX512F.
func MaskFnmaddPd ¶
MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask_fnmadd_pd'. Requires AVX512F.
func MaskFnmaddPs ¶
MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask_fnmadd_ps'. Requires AVX512F.
func MaskFnmaddRoundSd ¶
func MaskFnmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
    dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_round_sd'. Requires AVX512F.
func MaskFnmaddRoundSs ¶
func MaskFnmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_round_ss'. Requires AVX512F.
func MaskFnmaddSd ¶
MaskFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_sd'. Requires AVX512F.
func MaskFnmaddSs ¶
MaskFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_ss'. Requires AVX512F.
func MaskFnmsubPd ¶
MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask_fnmsub_pd'. Requires AVX512F.
func MaskFnmsubPs ¶
MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask_fnmsub_ps'. Requires AVX512F.
func MaskFnmsubRoundSd ¶
func MaskFnmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
    dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_round_sd'. Requires AVX512F.
func MaskFnmsubRoundSs ¶
func MaskFnmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
    dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_round_ss'. Requires AVX512F.
func MaskFnmsubSd ¶
MaskFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := a[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_sd'. Requires AVX512F.
func MaskFnmsubSs ¶
MaskFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := a[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_ss'. Requires AVX512F.
func MaskGetexpPd ¶
MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm_mask_getexp_pd'. Requires AVX512F.
func MaskGetexpPs ¶
MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm_mask_getexp_ps'. Requires AVX512F.
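For finite non-zero inputs, ConvertExpFP32/ConvertExpFP64 is simply the unbiased binary exponent, which Go's math.Logb computes directly. A sketch of the masked packed form under the same [4]float32 model as the earlier sketches (special inputs such as zero, infinities and NaN follow the instruction's own rules and are not modeled here):

    import "math"

    func maskGetexpPs(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                // math.Logb returns floor(log2(|x|)) for finite non-zero x.
                dst[j] = float32(math.Logb(float64(a[j])))
            } else {
                dst[j] = src[j] // masked-off lane: copy from 'src'
            }
        }
        return dst
    }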
func MaskGetexpRoundSd ¶
func MaskGetexpRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_round_sd'. Requires AVX512F.
func MaskGetexpRoundSs ¶
func MaskGetexpRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_round_ss'. Requires AVX512F.
func MaskGetexpSd ¶
MaskGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
IF k[0] dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_sd'. Requires AVX512F.
func MaskGetexpSs ¶
MaskGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
IF k[0] dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_ss'. Requires AVX512F.
func MaskGetmantPd ¶
func MaskGetmantPd(src x86.M128d, k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm_mask_getmant_pd'. Requires AVX512F.
func MaskGetmantPs ¶
func MaskGetmantPs(src x86.M128, k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm_mask_getmant_ps'. Requires AVX512F.
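GetNormalizedMantissa can be pictured with Go's math.Frexp, which splits x into a fraction in [0.5, 1) and a binary exponent; rescaling the fraction lands it in the requested interval. A sketch for the _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src case only (the instruction's handling of zero, infinities and NaN is omitted, and the helper name is illustrative):

    import "math"

    // getmant12 normalizes |x|'s mantissa into [1, 2), keeping x's sign.
    func getmant12(x float64) float64 {
        if x == 0 || math.IsNaN(x) || math.IsInf(x, 0) {
            return x // special cases are handled differently by the real instruction
        }
        frac, _ := math.Frexp(x) // frac in [0.5, 1), sign preserved
        return frac * 2          // rescale into [1, 2)
    }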
func MaskGetmantRoundSd ¶
func MaskGetmantRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)
MaskGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_round_sd'. Requires AVX512F.
func MaskGetmantRoundSs ¶
func MaskGetmantRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)
MaskGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_round_ss'. Requires AVX512F.
func MaskGetmantSd ¶
func MaskGetmantSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
MaskGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

IF k[0]
    dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_sd'. Requires AVX512F.
func MaskGetmantSs ¶
func MaskGetmantSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
MaskGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

IF k[0]
    dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_ss'. Requires AVX512F.
func MaskLoadSd ¶
MaskLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.
IF k[0] dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := src[63:0] FI dst[MAX:64] := 0
Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_load_sd'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func MaskLoadSs ¶
MaskLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.
IF k[0] dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := src[31:0] FI dst[MAX:32] := 0
Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_load_ss'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func MaskMaxEpi32 ¶
MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF a[i+31:i] > b[i+31:i]
            dst[i+31:i] := a[i+31:i]
        ELSE
            dst[i+31:i] := b[i+31:i]
        FI
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMAXSD'. Intrinsic: '_mm_mask_max_epi32'. Requires AVX512F.
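The masked integer min/max family is a per-lane compare-and-select with the usual writemask fallback to 'src'. A Go sketch of MaskMaxEpi32 over [4]int32 (an illustrative model, not the intrinsic; the other variants differ only in width, signedness, and comparison direction):

    func maskMaxEpi32(src [4]int32, k uint8, a, b [4]int32) (dst [4]int32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) == 0 {
                dst[j] = src[j] // masked-off lane: copy from 'src'
            } else if a[j] > b[j] {
                dst[j] = a[j]
            } else {
                dst[j] = b[j]
            }
        }
        return dst
    }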
func MaskMaxEpi64 ¶
MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF a[i+63:i] > b[i+63:i]
            dst[i+63:i] := a[i+63:i]
        ELSE
            dst[i+63:i] := b[i+63:i]
        FI
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm_mask_max_epi64'. Requires AVX512F.
func MaskMaxEpu32 ¶
MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF a[i+31:i] > b[i+31:i]
            dst[i+31:i] := a[i+31:i]
        ELSE
            dst[i+31:i] := b[i+31:i]
        FI
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMAXUD'. Intrinsic: '_mm_mask_max_epu32'. Requires AVX512F.
func MaskMaxEpu64 ¶
MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF a[i+63:i] > b[i+63:i]
            dst[i+63:i] := a[i+63:i]
        ELSE
            dst[i+63:i] := b[i+63:i]
        FI
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm_mask_max_epu64'. Requires AVX512F.
func MaskMaxPd ¶
MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm_mask_max_pd'. Requires AVX512F.
func MaskMaxPs ¶
MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm_mask_max_ps'. Requires AVX512F.
func MaskMaxRoundSd ¶
MaskMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0]
    dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_round_sd'. Requires AVX512F.
func MaskMaxRoundSs ¶
MaskMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0]
    dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_round_ss'. Requires AVX512F.
func MaskMaxSd ¶
MaskMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_sd'. Requires AVX512F.
func MaskMaxSs ¶
MaskMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_ss'. Requires AVX512F.
func MaskMinEpi32 ¶
MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF a[i+31:i] < b[i+31:i]
            dst[i+31:i] := a[i+31:i]
        ELSE
            dst[i+31:i] := b[i+31:i]
        FI
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMINSD'. Intrinsic: '_mm_mask_min_epi32'. Requires AVX512F.
func MaskMinEpi64 ¶
MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF a[i+63:i] < b[i+63:i]
            dst[i+63:i] := a[i+63:i]
        ELSE
            dst[i+63:i] := b[i+63:i]
        FI
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm_mask_min_epi64'. Requires AVX512F.
func MaskMinEpu32 ¶
MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*32
    IF k[j]
        IF a[i+31:i] < b[i+31:i]
            dst[i+31:i] := a[i+31:i]
        ELSE
            dst[i+31:i] := b[i+31:i]
        FI
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMINUD'. Intrinsic: '_mm_mask_min_epu32'. Requires AVX512F.
func MaskMinEpu64 ¶
MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1
    i := j*64
    IF k[j]
        IF a[i+63:i] < b[i+63:i]
            dst[i+63:i] := a[i+63:i]
        ELSE
            dst[i+63:i] := b[i+63:i]
        FI
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm_mask_min_epu64'. Requires AVX512F.
func MaskMinPd ¶
MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm_mask_min_pd'. Requires AVX512F.
func MaskMinPs ¶
MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm_mask_min_ps'. Requires AVX512F.
func MaskMinRoundSd ¶
MaskMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0]
    dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_round_sd'. Requires AVX512F.
func MaskMinRoundSs ¶
MaskMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0]
    dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_round_ss'. Requires AVX512F.
func MaskMinSd ¶
MaskMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_sd'. Requires AVX512F.
func MaskMinSs ¶
MaskMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_ss'. Requires AVX512F.
func MaskMovEpi32 ¶
MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQA32'. Intrinsic: '_mm_mask_mov_epi32'. Requires AVX512F.
func MaskMovEpi64 ¶
MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQA64'. Intrinsic: '_mm_mask_mov_epi64'. Requires AVX512F.
func MaskMovPd ¶
MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVAPD'. Intrinsic: '_mm_mask_mov_pd'. Requires AVX512F.
func MaskMovPs ¶
MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVAPS'. Intrinsic: '_mm_mask_mov_ps'. Requires AVX512F.
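The masked moves are pure per-lane selects: a set mask bit takes the lane from 'a', a clear bit takes it from 'src'. In Go terms, under the same [4]float32 model as the sketches above:

    func maskMovPs(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j] // active lane: take from 'a'
            } else {
                dst[j] = src[j] // masked-off lane: take from 'src'
            }
        }
        return dst
    }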
func MaskMoveSd ¶
MaskMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_move_sd'. Requires AVX512F.
func MaskMoveSs ¶
MaskMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_move_ss'. Requires AVX512F.
func MaskMovedupPd ¶
MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm_mask_movedup_pd'. Requires AVX512F.
func MaskMovehdupPs ¶
MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_mask_movehdup_ps'. Requires AVX512F.
func MaskMoveldupPs ¶
MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_mask_moveldup_ps'. Requires AVX512F.
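A plain-Go sketch of the moveldup semantics, under the same stand-in types as above (the movehdup variant differs only in taking the odd lanes, tmp = {a1, a1, a3, a3}):

// maskMoveldupPsEmu duplicates the even-indexed lanes of 'a'
// (tmp = {a0, a0, a2, a2}) and then applies the writemask blend.
func maskMoveldupPsEmu(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
    tmp := [4]float32{a[0], a[0], a[2], a[2]}
    for j := 0; j < 4; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = tmp[j]
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}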
func MaskMulEpi32 ¶
MaskMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm_mask_mul_epi32'. Requires AVX512F.
func MaskMulEpu32 ¶
MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm_mask_mul_epu32'. Requires AVX512F.
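The widening multiply reads only the low 32-bit half of each 64-bit lane and produces a full 64-bit product. A sketch, representing the inputs as four 32-bit halves in little-endian lane order (an assumption of this illustration, not a package type):

// maskMulEpu32Emu multiplies the low 32-bit half of each 64-bit lane,
// producing a full 64-bit product per lane, with the writemask blend.
func maskMulEpu32Emu(src [2]uint64, k uint8, a, b [4]uint32) (dst [2]uint64) {
    for j := 0; j < 2; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = uint64(a[2*j]) * uint64(b[2*j]) // low halves only
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}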
func MaskMulPd ¶
MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm_mask_mul_pd'. Requires AVX512F.
func MaskMulPs ¶
MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm_mask_mul_ps'. Requires AVX512F.
func MaskMulRoundSd ¶
func MaskMulRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := a[63:0] * b[63:0]
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_round_sd'. Requires AVX512F.
func MaskMulRoundSs ¶
func MaskMulRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := a[31:0] * b[31:0]
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_round_ss'. Requires AVX512F.
func MaskMulSd ¶
MaskMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_sd'. Requires AVX512F.
func MaskMulSs ¶
MaskMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_ss'. Requires AVX512F.
func MaskMulloEpi32 ¶
MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULLD'. Intrinsic: '_mm_mask_mullo_epi32'. Requires AVX512F.
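In contrast to the widening multiplies above, mullo keeps only the low 32 bits of each product. A minimal sketch with the usual stand-in types:

// maskMulloEpi32Emu keeps only the low 32 bits of each 64-bit product.
func maskMulloEpi32Emu(src [4]int32, k uint8, a, b [4]int32) (dst [4]int32) {
    for j := 0; j < 4; j++ {
        if k&(1<<uint(j)) != 0 {
            tmp := int64(a[j]) * int64(b[j]) // intermediate 64-bit product
            dst[j] = int32(tmp)              // truncate to the low 32 bits
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}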
func MaskOrEpi32 ¶
MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPORD'. Intrinsic: '_mm_mask_or_epi32'. Requires AVX512F.
func MaskOrEpi64 ¶
MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPORQ'. Intrinsic: '_mm_mask_or_epi64'. Requires AVX512F.
func MaskPermutePd ¶
MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskPermutePs ¶
MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0]   := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32]  := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64]  := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
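The SELECT4 helper above reduces to an array index in Go. A sketch of the whole permute, with illustrative names and the usual stand-in types:

// select4 picks one of the four 32-bit lanes of v by a 2-bit control,
// mirroring the SELECT4 helper in the pseudocode above.
func select4(v [4]float32, control byte) float32 {
    return v[control&0x3]
}

// maskPermutePsEmu applies imm8 two bits at a time, then blends by k.
func maskPermutePsEmu(src [4]float32, k uint8, a [4]float32, imm8 byte) (dst [4]float32) {
    tmp := [4]float32{
        select4(a, imm8),    // imm8[1:0]
        select4(a, imm8>>2), // imm8[3:2]
        select4(a, imm8>>4), // imm8[5:4]
        select4(a, imm8>>6), // imm8[7:6]
    }
    for j := 0; j < 4; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = tmp[j]
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}

The permutevar variants below are identical except that the 2-bit controls come from the low bits of each element of 'b' instead of from an immediate.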
func MaskPermutevarPd ¶
MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permutevar_pd'. Requires AVX512F.
func MaskPermutevarPs ¶
MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0]   := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32]  := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64]  := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permutevar_ps'. Requires AVX512F.
func MaskPermutex2varEpi32 ¶
MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMT2D'. Intrinsic: '_mm_mask_permutex2var_epi32'. Requires AVX512F.
func MaskPermutex2varEpi64 ¶
MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMT2Q'. Intrinsic: '_mm_mask_permutex2var_epi64'. Requires AVX512F.
func MaskPermutex2varPd ¶
MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := a[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMT2PD'. Intrinsic: '_mm_mask_permutex2var_pd'. Requires AVX512F.
func MaskPermutex2varPs ¶
MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := a[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMT2PS'. Intrinsic: '_mm_mask_permutex2var_ps'. Requires AVX512F.
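The permutex2var family selects across two source vectors: index bits [1:0] pick the lane and bit 2 picks the source. Note that the masked-off fallback is 'a', not a separate 'src'. A sketch with illustrative names:

// maskPermutex2varEpi32Emu: idx bits [1:0] pick the lane, bit 2 picks
// the source (0 = a, 1 = b); masked-off lanes fall back to 'a'.
func maskPermutex2varEpi32Emu(a [4]uint32, k uint8, idx, b [4]uint32) (dst [4]uint32) {
    for j := 0; j < 4; j++ {
        if k&(1<<uint(j)) != 0 {
            off := idx[j] & 0x3
            if idx[j]&0x4 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        } else {
            dst[j] = a[j] // fallback is 'a' for the VPERMT2* forms
        }
    }
    return dst
}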
func MaskRcp14Pd ¶
MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm_mask_rcp14_pd'. Requires AVX512F.
func MaskRcp14Ps ¶
MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm_mask_rcp14_ps'. Requires AVX512F.
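The hardware computes a table-driven approximation with relative error below 2^-14; plain Go has no direct equivalent, so a sketch can only substitute the exact reciprocal for APPROXIMATE (results will differ from real hardware in the low bits):

// maskRcp14PsEmu stands in the exact reciprocal for the hardware's
// 2^-14-accurate approximation; the blend logic matches the pseudocode.
func maskRcp14PsEmu(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
    for j := 0; j < 4; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = 1.0 / a[j] // hardware: APPROXIMATE(1.0/a[j])
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}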
func MaskRcp14Sd ¶
MaskRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[63:0] := APPROXIMATE(1.0/b[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRCP14SD'. Intrinsic: '_mm_mask_rcp14_sd'. Requires AVX512F.
func MaskRcp14Ss ¶
MaskRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[31:0] := APPROXIMATE(1.0/b[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRCP14SS'. Intrinsic: '_mm_mask_rcp14_ss'. Requires AVX512F.
func MaskRolEpi32 ¶
MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm_mask_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRolEpi64 ¶
MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm_mask_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRolvEpi32 ¶
MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm_mask_rolv_epi32'. Requires AVX512F.
func MaskRolvEpi64 ¶
MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm_mask_rolv_epi64'. Requires AVX512F.
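Go's standard library already implements the rotate helper, including the modulo reduction of the count, so a sketch of the variable rotate is short (names illustrative):

import "math/bits"

// maskRolvEpi64Emu rotates each 64-bit lane left by the corresponding
// count in b; bits.RotateLeft64 reduces the count modulo 64, matching
// LEFT_ROTATE_QWORDS above.
func maskRolvEpi64Emu(src [2]uint64, k uint8, a, b [2]uint64) (dst [2]uint64) {
    for j := 0; j < 2; j++ {
        if k&(1<<uint(j)) != 0 {
            dst[j] = bits.RotateLeft64(a[j], int(b[j]&63))
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}

The right-rotate (rorv) variants below are the same sketch with a negated count: bits.RotateLeft64(a[j], -int(b[j]&63)).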
func MaskRorEpi32 ¶
MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm_mask_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRorEpi64 ¶
MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm_mask_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRorvEpi32 ¶
MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm_mask_rorv_epi32'. Requires AVX512F.
func MaskRorvEpi64 ¶
MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm_mask_rorv_epi64'. Requires AVX512F.
func MaskRoundscalePd ¶
MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_mask_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRoundscalePs ¶
MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_mask_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
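The roundscale core is "scale up by 2^M, round to an integer, scale back down by 2^-M", which rounds to M fraction bits. A minimal Go sketch of that core (exception flags and the SPE check omitted; names illustrative):

import "math"

// roundscale rounds x to m fraction bits: 2^-m * round(2^m * x).
// The four rounding directions 0..3 correspond to math.RoundToEven,
// math.Floor, math.Ceil, and math.Trunc.
func roundscale(x float64, m int, round func(float64) float64) float64 {
    return math.Ldexp(round(math.Ldexp(x, m)), -m)
}

For example, roundscale(1.23456, 4, math.RoundToEven) rounds to the nearest multiple of 1/16.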
func MaskRoundscaleRoundSd ¶
func MaskRoundscaleRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)
MaskRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRoundscaleRoundSs ¶
func MaskRoundscaleRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)
MaskRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRoundscaleSd ¶
func MaskRoundscaleSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
MaskRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
    1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
    2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
    3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRoundscaleSs ¶
MaskRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
    0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
    1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
    2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
    3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskRsqrt14Pd ¶
MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_mask_rsqrt14_pd'. Requires AVX512F.
func MaskRsqrt14Ps ¶
MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_mask_rsqrt14_ps'. Requires AVX512F.
func MaskRsqrt14Sd ¶
MaskRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_mask_rsqrt14_sd'. Requires AVX512F.
func MaskRsqrt14Ss ¶
MaskRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_mask_rsqrt14_ss'. Requires AVX512F.
func MaskScalefPd ¶
MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := src[i+63:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm_mask_scalef_pd'. Requires AVX512F.
func MaskScalefPs ¶
MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm_mask_scalef_ps'. Requires AVX512F.
func MaskScalefRoundSd ¶
func MaskScalefRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_round_sd'. Requires AVX512F.
func MaskScalefRoundSs ¶
func MaskScalefRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_round_ss'. Requires AVX512F.
func MaskScalefSd ¶
MaskScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_sd'. Requires AVX512F.
func MaskScalefSs ¶
MaskScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_ss'. Requires AVX512F.
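Stripped of the NaN and denormal special cases, the SCALE helper is just a * 2^floor(b). A minimal Go sketch of that happy path (names illustrative; the special cases above are deliberately omitted):

import "math"

// scalef computes a * 2^floor(b), the core of the VSCALEF* family for
// finite, non-NaN inputs; math.Ldexp(x, e) returns x * 2^e exactly.
func scalef(a, b float64) float64 {
    return math.Ldexp(a, int(math.Floor(b)))
}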
func MaskSet1Epi32 ¶
MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_set1_epi32'. Requires AVX512F.
func MaskSet1Epi64 ¶
MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_set1_epi64'. Requires AVX512F.
func MaskShuffleEpi32 ¶
MaskShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0]   := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32]  := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64]  := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPSHUFD'. Intrinsic: '_mm_mask_shuffle_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskShufflePd ¶
func MaskShufflePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)
MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm_mask_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskShufflePs ¶
MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
    0: tmp[31:0] := src[31:0]
    1: tmp[31:0] := src[63:32]
    2: tmp[31:0] := src[95:64]
    3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0]   := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32]  := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64]  := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := src[i+31:i]
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm_mask_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskSllEpi32 ¶
MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_sll_epi32'. Requires AVX512F.
func MaskSllEpi64 ¶
MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_sll_epi64'. Requires AVX512F.
func MaskSlliEpi32 ¶
MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_slli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskSlliEpi64 ¶
MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
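Note the out-of-range behavior in the pseudocode: a count above the element width zeroes the lane rather than being reduced modulo the width. A Go sketch making that case explicit (for uint64 the explicit branch matches what Go's shift would produce anyway, but it documents the intent):

// maskSlliEpi64Emu zeroes the lane outright when the shift count
// exceeds 63, as the pseudocode specifies.
func maskSlliEpi64Emu(src [2]uint64, k uint8, a [2]uint64, imm8 uint8) (dst [2]uint64) {
    for j := 0; j < 2; j++ {
        switch {
        case k&(1<<uint(j)) == 0:
            dst[j] = src[j] // masked off: keep 'src'
        case imm8 > 63:
            dst[j] = 0 // over-shift: lane becomes zero
        default:
            dst[j] = a[j] << imm8
        }
    }
    return dst
}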
func MaskSllvEpi32 ¶
MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVD'. Intrinsic: '_mm_mask_sllv_epi32'. Requires AVX512F.
func MaskSllvEpi64 ¶
MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm_mask_sllv_epi64'. Requires AVX512F.
func MaskSqrtPd ¶
MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm_mask_sqrt_pd'. Requires AVX512F.
func MaskSqrtPs ¶
MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm_mask_sqrt_ps'. Requires AVX512F.
func MaskSqrtRoundSd ¶
func MaskSqrtRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[63:0] := SQRT(a[63:0])
ELSE
    dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_round_sd'. Requires AVX512F.
func MaskSqrtRoundSs ¶
func MaskSqrtRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
    dst[31:0] := SQRT(a[31:0])
ELSE
    dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_round_ss'. Requires AVX512F.
func MaskSqrtSd ¶
MaskSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := src[63:0] FI dst[127:64] := b[127:64] dst[MAX:128] := 0
Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_sd'. Requires AVX512F.
func MaskSqrtSs ¶
MaskSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := src[31:0] FI dst[127:32] := b[127:32] dst[MAX:128] := 0
Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_ss'. Requires AVX512F.
func MaskSraEpi32 ¶
MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_sra_epi32'. Requires AVX512F.
func MaskSraEpi64 ¶
MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_sra_epi64'. Requires AVX512F.
func MaskSraiEpi32 ¶
MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_srai_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskSraiEpi64 ¶
MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
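Unlike the logical shifts, the arithmetic shifts fill with copies of the sign bit, and an over-large count saturates to an all-sign-bits lane rather than zero. A Go sketch using signed lanes (names illustrative):

// maskSraiEpi64Emu: arithmetic right shift fills with the sign bit;
// counts above 63 are clamped to 63, and a>>63 is 0 or -1, i.e. a
// lane consisting entirely of sign bits.
func maskSraiEpi64Emu(src [2]int64, k uint8, a [2]int64, imm8 uint8) (dst [2]int64) {
    for j := 0; j < 2; j++ {
        if k&(1<<uint(j)) != 0 {
            n := imm8
            if n > 63 {
                n = 63
            }
            dst[j] = a[j] >> n // Go's >> on signed ints is arithmetic
        } else {
            dst[j] = src[j]
        }
    }
    return dst
}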
func MaskSravEpi32 ¶
MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVD'. Intrinsic: '_mm_mask_srav_epi32'. Requires AVX512F.
func MaskSravEpi64 ¶
MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm_mask_srav_epi64'. Requires AVX512F.
func MaskSrlEpi32 ¶
MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srl_epi32'. Requires AVX512F.
func MaskSrlEpi64 ¶
MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srl_epi64'. Requires AVX512F.
func MaskSrliEpi32 ¶
MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskSrliEpi64 ¶
MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskSrlvEpi32 ¶
MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVD'. Intrinsic: '_mm_mask_srlv_epi32'. Requires AVX512F.
func MaskSrlvEpi64 ¶
MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm_mask_srlv_epi64'. Requires AVX512F.
func MaskStoreSd ¶
MaskStoreSd: Store the lower double-precision (64-bit) floating-point element from 'a' into memory using writemask 'k'.
'mem_addr' must be aligned on a 16-byte boundary or a general-protection
exception may be generated.
IF k[0] MEM[mem_addr+63:mem_addr] := a[63:0] FI
Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_store_sd'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func MaskStoreSs ¶
MaskStoreSs: Store the lower single-precision (32-bit) floating-point element from 'a' into memory using writemask 'k'.
'mem_addr' must be aligned on a 16-byte boundary or a general-protection
exception may be generated.
IF k[0] MEM[mem_addr+31:mem_addr] := a[31:0] FI
Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_store_ss'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
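As the FIXME notes, the store intrinsics take a pointer. The semantics are nonetheless simple: memory is written only when mask bit 0 is set, and left untouched otherwise. A sketch, where the float32 pointer stands in for the 16-byte-aligned mem_addr:

// maskStoreSsEmu stores the low lane of 'a' only when mask bit 0 is
// set; an unset mask leaves the destination memory unchanged.
func maskStoreSsEmu(mem *float32, k uint8, a [4]float32) {
    if k&1 != 0 {
        *mem = a[0]
    }
}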
func MaskSubEpi32 ¶
MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBD'. Intrinsic: '_mm_mask_sub_epi32'. Requires AVX512F.
func MaskSubEpi64 ¶
MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm_mask_sub_epi64'. Requires AVX512F.
func MaskSubPd ¶
MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm_mask_sub_pd'. Requires AVX512F.
func MaskSubPs ¶
MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm_mask_sub_ps'. Requires AVX512F.
func MaskSubRoundSd ¶
func MaskSubRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_round_sd'. Requires AVX512F.
func MaskSubRoundSs ¶
func MaskSubRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_round_ss'. Requires AVX512F.
func MaskSubSd ¶
MaskSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := src[63:0] FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_sd'. Requires AVX512F.
func MaskSubSs ¶
MaskSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := src[31:0] FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_ss'. Requires AVX512F.
func MaskTernarylogicEpi32 ¶
func MaskTernarylogicEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_mask_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskTernarylogicEpi64 ¶
func MaskTernarylogicEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_mask_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
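The imm8 of the ternarylogic entries above is an 8-entry truth table: for every bit position the three input bits form an index, and the indexed bit of imm8 is the output. A scalar sketch for one 32-bit lane (plain Go, not the intrinsic); for example, imm8 = 0xF8 implements src OR (a AND b):

    // ternarylogic32(s, a, b, 0xF8) == s | (a & b) for all inputs.
    func ternarylogic32(src, a, b uint32, imm8 uint8) uint32 {
        var dst uint32
        for h := 0; h < 32; h++ {
            // idx = src_bit<<2 | a_bit<<1 | b_bit, as in the pseudocode
            idx := (src>>h&1)<<2 | (a>>h&1)<<1 | (b >> h & 1)
            dst |= (uint32(imm8) >> idx & 1) << h
        }
        return dst
    }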
func MaskTestEpi32Mask ¶
MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTMD'. Intrinsic: '_mm_mask_test_epi32_mask'. Requires AVX512F.
func MaskTestEpi64Mask ¶
MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm_mask_test_epi64_mask'. Requires AVX512F.
func MaskTestnEpi32Mask ¶
MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 3 i := j*32 IF k1[j] k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm_mask_testn_epi32_mask'. Requires AVX512F.
func MaskTestnEpi64Mask ¶
MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 1 i := j*64 IF k1[j] k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:2] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_mask_testn_epi64_mask'. Requires AVX512F.
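The test/testn pairs above reduce a vector comparison to a mask bit per lane. A minimal scalar sketch of both in plain Go (not the intrinsics); lanes disabled by 'k1' always produce a 0 bit:

    func maskTestEpi32Mask(k1 uint8, a, b [4]uint32) (k uint8) {
        for j := 0; j < 4; j++ {
            if k1&(1<<j) != 0 && a[j]&b[j] != 0 { // AND is non-zero
                k |= 1 << j
            }
        }
        return
    }

    func maskTestnEpi32Mask(k1 uint8, a, b [4]uint32) (k uint8) {
        for j := 0; j < 4; j++ {
            if k1&(1<<j) != 0 && a[j]&b[j] == 0 { // AND is zero
                k |= 1 << j
            }
        }
        return
    }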
func MaskUnpackhiEpi32 ¶
MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_mask_unpackhi_epi32'. Requires AVX512F.
func MaskUnpackhiEpi64 ¶
MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_mask_unpackhi_epi64'. Requires AVX512F.
func MaskUnpackhiPd ¶
MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_mask_unpackhi_pd'. Requires AVX512F.
func MaskUnpackhiPs ¶
MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_mask_unpackhi_ps'. Requires AVX512F.
func MaskUnpackloEpi32 ¶
MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_mask_unpacklo_epi32'. Requires AVX512F.
func MaskUnpackloEpi64 ¶
MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_mask_unpacklo_epi64'. Requires AVX512F.
func MaskUnpackloPd ¶
MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_mask_unpacklo_pd'. Requires AVX512F.
func MaskUnpackloPs ¶
MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_mask_unpacklo_ps'. Requires AVX512F.
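The INTERLEAVE_* helpers used by the unpack entries above reduce to fixed lane shuffles. A sketch of the two 32-bit variants on [4]uint32 lanes (plain Go, not the intrinsics); the masked forms then pick per lane between this result and 'src':

    // High halves: lanes 2 and 3 of each source, alternating a/b.
    func interleaveHighDwords(a, b [4]uint32) [4]uint32 {
        return [4]uint32{a[2], b[2], a[3], b[3]}
    }

    // Low halves: lanes 0 and 1 of each source, alternating a/b.
    func interleaveLowDwords(a, b [4]uint32) [4]uint32 {
        return [4]uint32{a[0], b[0], a[1], b[1]}
    }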
func MaskXorEpi32 ¶
MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPXORD'. Intrinsic: '_mm_mask_xor_epi32'. Requires AVX512F.
func MaskXorEpi64 ¶
MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := src[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm_mask_xor_epi64'. Requires AVX512F.
func MaskzAbsEpi32 ¶
MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ABS(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSD'. Intrinsic: '_mm_maskz_abs_epi32'. Requires AVX512F.
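All Maskz* entries follow the same zeromask policy: an inactive lane becomes 0, where the corresponding Mask* entries above copy the lane from 'src'. A minimal sketch of the distinction, using the abs operation above (plain Go, not the intrinsic):

    func maskzAbsEpi32(k uint8, a [4]int32) (dst [4]int32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                if a[j] < 0 {
                    dst[j] = -a[j] // wraps for math.MinInt32, as the hardware does
                } else {
                    dst[j] = a[j]
                }
            }
            // else: lane stays zero (zeromask); MaskAbsEpi32 would copy src[j] instead
        }
        return
    }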
func MaskzAbsEpi64 ¶
MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ABS(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSQ'. Intrinsic: '_mm_maskz_abs_epi64'. Requires AVX512F.
func MaskzAddEpi32 ¶
MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] + b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDD'. Intrinsic: '_mm_maskz_add_epi32'. Requires AVX512F.
func MaskzAddEpi64 ¶
MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] + b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDQ'. Intrinsic: '_mm_maskz_add_epi64'. Requires AVX512F.
func MaskzAddRoundSd ¶
MaskzAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_round_sd'. Requires AVX512F.
func MaskzAddRoundSs ¶
MaskzAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_round_ss'. Requires AVX512F.
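The 'rounding' arguments shown throughout are small bit fields combined with OR. A sketch of Go constants mirroring the conventional <smmintrin.h> values; the names and values here are assumptions of this sketch, since this package does not export them:

    const (
        MMFroundToNearestInt = 0x00 // round to nearest
        MMFroundToNegInf     = 0x01 // round down
        MMFroundToPosInf     = 0x02 // round up
        MMFroundToZero       = 0x03 // truncate
        MMFroundCurDirection = 0x04 // use MXCSR.RC
        MMFroundNoExc        = 0x08 // suppress exceptions
    )

With these, "truncate and suppress exceptions" would be passed as MMFroundToZero|MMFroundNoExc.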
func MaskzAddSd ¶
MaskzAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] + b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_sd'. Requires AVX512F.
func MaskzAddSs ¶
MaskzAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] + b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_ss'. Requires AVX512F.
func MaskzAndEpi32 ¶
MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPANDD'. Intrinsic: '_mm_maskz_and_epi32'. Requires AVX512F.
func MaskzAndEpi64 ¶
MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPANDQ'. Intrinsic: '_mm_maskz_and_epi64'. Requires AVX512F.
func MaskzAndnotEpi32 ¶
MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPANDND'. Intrinsic: '_mm_maskz_andnot_epi32'. Requires AVX512F.
func MaskzAndnotEpi64 ¶
MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPANDNQ'. Intrinsic: '_mm_maskz_andnot_epi64'. Requires AVX512F.
func MaskzBroadcastdEpi32 ¶
MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_broadcastd_epi32'. Requires AVX512F.
func MaskzBroadcastqEpi64 ¶
MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_broadcastq_epi64'. Requires AVX512F.
func MaskzBroadcastssPs ¶
MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_maskz_broadcastss_ps'. Requires AVX512F.
func MaskzCompressEpi32 ¶
MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_maskz_compress_epi32'. Requires AVX512F.
func MaskzCompressEpi64 ¶
MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_maskz_compress_epi64'. Requires AVX512F.
func MaskzCompressPd ¶
MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 64 m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[m+size-1:m] := a[i+63:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_maskz_compress_pd'. Requires AVX512F.
func MaskzCompressPs ¶
MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.
size := 32 m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[m+size-1:m] := a[i+31:i] m := m + size FI ENDFOR dst[127:m] := 0 dst[MAX:128] := 0
Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_maskz_compress_ps'. Requires AVX512F.
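A scalar sketch of the compress semantics above (plain Go, not the intrinsic): active lanes are packed toward element 0 and the remaining lanes stay zero:

    func maskzCompressEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
        m := 0
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[m] = a[j] // next active lane goes to the next free slot
                m++
            }
        }
        return // lanes from m upward are already zero
    }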
func MaskzCvtRoundpsPh ¶
MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvt_roundps_ph'. Requires AVX512F.
func MaskzCvtRoundsdSs ¶
MaskzCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvt_roundsd_ss'. Requires AVX512F.
func MaskzCvtRoundssSd ¶
MaskzCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvt_roundss_sd'. Requires AVX512F.
func MaskzCvtepi16Epi32 ¶
MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := SignExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_maskz_cvtepi16_epi32'. Requires AVX512F.
func MaskzCvtepi16Epi64 ¶
MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := SignExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_maskz_cvtepi16_epi64'. Requires AVX512F.
func MaskzCvtepi32Epi16 ¶
MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVDW'. Intrinsic: '_mm_maskz_cvtepi32_epi16'. Requires AVX512F.
func MaskzCvtepi32Epi64 ¶
MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := SignExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_maskz_cvtepi32_epi64'. Requires AVX512F.
func MaskzCvtepi32Epi8 ¶
MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVDB'. Intrinsic: '_mm_maskz_cvtepi32_epi8'. Requires AVX512F.
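The down-converting entries (VPMOVDW, VPMOVDB, VPMOVQD, ...) all truncate: each wide lane keeps only its low bits. A scalar sketch for the 32-to-8-bit case above (plain Go, not the intrinsic); the real destination is a 128-bit vector of which only the low 32 bits are written:

    func maskzCvtepi32Epi8(k uint8, a [4]int32) (dst [4]int8) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = int8(a[j]) // keeps bits 7:0, discards the rest
            }
            // inactive lanes stay zero
        }
        return
    }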
func MaskzCvtepi32Pd ¶
MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*32 m := j*64 IF k[j] dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) ELSE dst[m+63:m] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_maskz_cvtepi32_pd'. Requires AVX512F.
func MaskzCvtepi32Ps ¶
MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_maskz_cvtepi32_ps'. Requires AVX512F.
func MaskzCvtepi64Epi16 ¶
MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVQW'. Intrinsic: '_mm_maskz_cvtepi64_epi16'. Requires AVX512F.
func MaskzCvtepi64Epi32 ¶
MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVQD'. Intrinsic: '_mm_maskz_cvtepi64_epi32'. Requires AVX512F.
func MaskzCvtepi64Epi8 ¶
MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVQB'. Intrinsic: '_mm_maskz_cvtepi64_epi8'. Requires AVX512F.
func MaskzCvtepi8Epi32 ¶
MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := SignExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_maskz_cvtepi8_epi32'. Requires AVX512F.
func MaskzCvtepi8Epi64 ¶
MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := SignExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_maskz_cvtepi8_epi64'. Requires AVX512F.
func MaskzCvtepu16Epi32 ¶
MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_maskz_cvtepu16_epi32'. Requires AVX512F.
func MaskzCvtepu16Epi64 ¶
MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+15:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_maskz_cvtepu16_epi64'. Requires AVX512F.
func MaskzCvtepu32Epi64 ¶
MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_maskz_cvtepu32_epi64'. Requires AVX512F.
func MaskzCvtepu32Pd ¶
MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 l := j*32 IF k[j] dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_maskz_cvtepu32_pd'. Requires AVX512F.
func MaskzCvtepu8Epi32 ¶
MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[i+31:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_maskz_cvtepu8_epi32'. Requires AVX512F.
func MaskzCvtepu8Epi64 ¶
MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[i+63:i] := ZeroExtend(a[l+7:l]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_maskz_cvtepu8_epi64'. Requires AVX512F.
func MaskzCvtpdEpi32 ¶
MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_maskz_cvtpd_epi32'. Requires AVX512F.
func MaskzCvtpdEpu32 ¶
MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_maskz_cvtpd_epu32'. Requires AVX512F.
func MaskzCvtpdPs ¶
MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*32 l := j*64 IF k[j] dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_maskz_cvtpd_ps'. Requires AVX512F.
func MaskzCvtphPs ¶
MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 m := j*16 IF k[j] dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_maskz_cvtph_ps'. Requires AVX512F.
func MaskzCvtpsEpi32 ¶
MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_maskz_cvtps_epi32'. Requires AVX512F.
func MaskzCvtpsEpu32 ¶
MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_maskz_cvtps_epu32'. Requires AVX512F.
func MaskzCvtpsPh ¶
MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

FOR j := 0 to 3 i := 16*j l := 32*j IF k[j] dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvtps_ph'. Requires AVX512F.
func MaskzCvtsdSs ¶
MaskzCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := Convert_FP64_To_FP32(b[63:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvtsd_ss'. Requires AVX512F.
func MaskzCvtsepi32Epi16 ¶
MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSDW'. Intrinsic: '_mm_maskz_cvtsepi32_epi16'. Requires AVX512F.
func MaskzCvtsepi32Epi8 ¶
MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSDB'. Intrinsic: '_mm_maskz_cvtsepi32_epi8'. Requires AVX512F.
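Unlike the plain VPMOV* truncations, the VPMOVS* entries saturate. A scalar sketch of Saturate_Int32_To_Int8 as used above (plain Go): out-of-range values clamp to the int8 limits instead of keeping only the low bits:

    func saturateInt32ToInt8(x int32) int8 {
        if x > 127 {
            return 127
        }
        if x < -128 {
            return -128
        }
        return int8(x)
    }

The unsigned variants below (VPMOVUS*) clamp to the unsigned range, e.g. [0, 255] for 8-bit results, in the same way.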
func MaskzCvtsepi64Epi16 ¶
MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVSQW'. Intrinsic: '_mm_maskz_cvtsepi64_epi16'. Requires AVX512F.
func MaskzCvtsepi64Epi32 ¶
MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSQD'. Intrinsic: '_mm_maskz_cvtsepi64_epi32'. Requires AVX512F.
func MaskzCvtsepi64Epi8 ¶
MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVSQB'. Intrinsic: '_mm_maskz_cvtsepi64_epi8'. Requires AVX512F.
func MaskzCvtssSd ¶
MaskzCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := Convert_FP32_To_FP64(b[31:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvtss_sd'. Requires AVX512F.
func MaskzCvttpdEpi32 ¶
MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_maskz_cvttpd_epi32'. Requires AVX512F.
func MaskzCvttpdEpu32 ¶
MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 32*j l := 64*j IF k[j] dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_maskz_cvttpd_epu32'. Requires AVX512F.
func MaskzCvttpsEpi32 ¶
MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_maskz_cvttps_epi32'. Requires AVX512F.
func MaskzCvttpsEpu32 ¶
MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_maskz_cvttps_epu32'. Requires AVX512F.
func MaskzCvtusepi32Epi16 ¶
MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_maskz_cvtusepi32_epi16'. Requires AVX512F.
func MaskzCvtusepi32Epi8 ¶
MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_maskz_cvtusepi32_epi8'. Requires AVX512F.
func MaskzCvtusepi64Epi16 ¶
MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 16*j IF k[j] dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:32] := 0
Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_maskz_cvtusepi64_epi16'. Requires AVX512F.
func MaskzCvtusepi64Epi32 ¶
MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 32*j IF k[j] dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i]) ELSE dst[l+31:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_maskz_cvtusepi64_epi32'. Requires AVX512F.
func MaskzCvtusepi64Epi8 ¶
MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:16] := 0
Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_maskz_cvtusepi64_epi8'. Requires AVX512F.
func MaskzDivPd ¶
MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := 64*j IF k[j] dst[i+63:i] := a[i+63:i] / b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VDIVPD'. Intrinsic: '_mm_maskz_div_pd'. Requires AVX512F.
func MaskzDivPs ¶
MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := 32*j IF k[j] dst[i+31:i] := a[i+31:i] / b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VDIVPS'. Intrinsic: '_mm_maskz_div_ps'. Requires AVX512F.
func MaskzDivRoundSd ¶
MaskzDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_round_sd'. Requires AVX512F.
func MaskzDivRoundSs ¶
MaskzDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_round_ss'. Requires AVX512F.
func MaskzDivSd ¶
MaskzDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] / b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_sd'. Requires AVX512F.
func MaskzDivSs ¶
MaskzDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] / b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_ss'. Requires AVX512F.
func MaskzExpandEpi32 ¶
MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPEXPANDD'. Intrinsic: '_mm_maskz_expand_epi32'. Requires AVX512F.
func MaskzExpandEpi64 ¶
MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_maskz_expand_epi64'. Requires AVX512F.
func MaskzExpandPd ¶
MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[m+63:m] m := m + 64 ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXPANDPD'. Intrinsic: '_mm_maskz_expand_pd'. Requires AVX512F.
func MaskzExpandPs ¶
MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
m := 0 FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[m+31:m] m := m + 32 ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VEXPANDPS'. Intrinsic: '_mm_maskz_expand_ps'. Requires AVX512F.
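Expand is the inverse of compress: consecutive source lanes are scattered into the active lanes of 'dst'. A scalar sketch (plain Go, not the intrinsic):

    func maskzExpandEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
        m := 0
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[m] // next contiguous source lane fills this active lane
                m++
            }
            // inactive lanes stay zero
        }
        return
    }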
func MaskzFixupimmPd ¶
func MaskzFixupimmPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_maskz_fixupimm_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
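The heart of FIXUPIMMPD above is a nibble lookup: the classified token j selects bits 3+4*j:4*j of the per-lane table in 'c', and that nibble chooses the replacement value; 'imm8' only selects which exception flags are raised. A sketch of the lookup alone (plain Go; classification of the input into tokens omitted):

    // tokenResponse extracts src3[3+4*j:4*j] from the pseudocode:
    // nibble j of the 64-bit per-lane table c.
    func tokenResponse(c uint64, token uint) uint8 {
        return uint8(c>>(4*token)) & 0xF
    }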
func MaskzFixupimmPs ¶
MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0, SNAN_TOKEN := 1, ZERO_VALUE_TOKEN := 2, ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4, POS_INF_TOKEN := 5, NEG_VALUE_TOKEN := 6, POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_maskz_fixupimm_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzFixupimmRoundSd ¶
func MaskzFixupimmRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)
MaskzFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign ? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
IF k[0]
    dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzFixupimmRoundSs ¶
func MaskzFixupimmRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)
MaskzFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
IF k[0]
    dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzFixupimmSd ¶
func MaskzFixupimmSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)
MaskzFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) {
    tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
    CASE(tsrc[63:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[63:0] := src1[63:0]
        1 : dest[63:0] := tsrc[63:0]
        2 : dest[63:0] := QNaN(tsrc[63:0])
        3 : dest[63:0] := QNAN_Indefinite
        4 : dest[63:0] := -INF
        5 : dest[63:0] := +INF
        6 : dest[63:0] := tsrc.sign ? -INF : +INF
        7 : dest[63:0] := -0
        8 : dest[63:0] := +0
        9 : dest[63:0] := -1
        10: dest[63:0] := +1
        11: dest[63:0] := 1/2
        12: dest[63:0] := 90.0
        13: dest[63:0] := PI/2
        14: dest[63:0] := MAX_FLOAT
        15: dest[63:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[63:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[63:0]
}
IF k[0]
    dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzFixupimmSs ¶
func MaskzFixupimmSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)
MaskzFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.
enum TOKEN_TYPE {
    QNAN_TOKEN := 0,
    SNAN_TOKEN := 1,
    ZERO_VALUE_TOKEN := 2,
    ONE_VALUE_TOKEN := 3,
    NEG_INF_TOKEN := 4,
    POS_INF_TOKEN := 5,
    NEG_VALUE_TOKEN := 6,
    POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) {
    tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
    CASE(tsrc[31:0] of TOKEN_TYPE)
        QNAN_TOKEN: j := 0
        SNAN_TOKEN: j := 1
        ZERO_VALUE_TOKEN: j := 2
        ONE_VALUE_TOKEN: j := 3
        NEG_INF_TOKEN: j := 4
        POS_INF_TOKEN: j := 5
        NEG_VALUE_TOKEN: j := 6
        POS_VALUE_TOKEN: j := 7
    ESAC
    token_response[3:0] := src3[3+4*j:4*j]
    CASE(token_response[3:0]) of
        0 : dest[31:0] := src1[31:0]
        1 : dest[31:0] := tsrc[31:0]
        2 : dest[31:0] := QNaN(tsrc[31:0])
        3 : dest[31:0] := QNAN_Indefinite
        4 : dest[31:0] := -INF
        5 : dest[31:0] := +INF
        6 : dest[31:0] := tsrc.sign ? -INF : +INF
        7 : dest[31:0] := -0
        8 : dest[31:0] := +0
        9 : dest[31:0] := -1
        10: dest[31:0] := +1
        11: dest[31:0] := 1/2
        12: dest[31:0] := 90.0
        13: dest[31:0] := PI/2
        14: dest[31:0] := MAX_FLOAT
        15: dest[31:0] := -MAX_FLOAT
    ESAC
    CASE(tsrc[31:0] of TOKEN_TYPE)
        ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
        ZERO_VALUE_TOKEN: if imm8[1] then set #IE
        ONE_VALUE_TOKEN: if imm8[2] then set #ZE
        ONE_VALUE_TOKEN: if imm8[3] then set #IE
        SNAN_TOKEN: if imm8[4] then set #IE
        NEG_INF_TOKEN: if imm8[5] then set #IE
        NEG_VALUE_TOKEN: if imm8[6] then set #IE
        POS_INF_TOKEN: if imm8[7] then set #IE
    ESAC
    RETURN dest[31:0]
}
IF k[0]
    dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
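To make the table lookup above concrete, here is a plain-Go sketch of a single 64-bit FIXUPIMM lane. It is illustrative only and not part of this package: the DAZ handling and the #ZE/#IE flag reporting controlled by 'imm8' are deliberately omitted, and classify and fixupimmLane are hypothetical names.

    package main

    import (
        "fmt"
        "math"
    )

    // Token classes from the TOKEN_TYPE enum in the pseudocode above.
    const (
        qnanToken = iota
        snanToken
        zeroValueToken
        oneValueToken
        negInfToken
        posInfToken
        negValueToken
        posValueToken
    )

    // classify maps a float64 onto its TOKEN_TYPE class. Go's math package
    // only produces quiet NaNs, so the quiet-bit check matters only for bit
    // patterns built by hand.
    func classify(x float64) int {
        switch {
        case math.IsNaN(x):
            if math.Float64bits(x)&(1<<51) != 0 {
                return qnanToken
            }
            return snanToken
        case x == 0:
            return zeroValueToken
        case x == 1:
            return oneValueToken
        case math.IsInf(x, -1):
            return negInfToken
        case math.IsInf(x, 1):
            return posInfToken
        case x < 0:
            return negValueToken
        default:
            return posValueToken
        }
    }

    // fixupimmLane emulates one lane of FIXUPIMMPD: the token class of src2
    // selects a 4-bit response from the 64-bit table, and the response picks
    // the output value.
    func fixupimmLane(src1, src2 float64, table uint64) float64 {
        j := classify(src2)
        switch (table >> (4 * uint(j))) & 0xF {
        case 0:
            return src1
        case 1:
            return src2
        case 2, 3:
            return math.NaN() // QNaN(src2) and QNaN indefinite, respectively
        case 4:
            return math.Inf(-1)
        case 5:
            return math.Inf(1)
        case 6: // INF with the sign of src2
            if math.Signbit(src2) {
                return math.Inf(-1)
            }
            return math.Inf(1)
        case 7:
            return math.Copysign(0, -1)
        case 8:
            return 0
        case 9:
            return -1
        case 10:
            return 1
        case 11:
            return 0.5
        case 12:
            return 90.0
        case 13:
            return math.Pi / 2
        case 14:
            return math.MaxFloat64
        default: // 15
            return -math.MaxFloat64
        }
    }

    func main() {
        // Response 14 (MAX_FLOAT) in the POS_INF slot, response 1 (pass
        // src2 through) everywhere else.
        table := uint64(0x11E11111)
        fmt.Println(fixupimmLane(0, math.Inf(1), table)) // 1.7976931348623157e+308
        fmt.Println(fixupimmLane(0, 3.5, table))         // 3.5
    }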
func MaskzFmaddPd ¶
func MaskzFmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_maskz_fmadd_pd'. Requires AVX512F.
func MaskzFmaddPs ¶
func MaskzFmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_maskz_fmadd_ps'. Requires AVX512F.
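All of the packed Maskz* operations share the zero-masking pattern in the pseudocode above. A minimal sketch in plain Go, using this intrinsic's semantics as the example (maskzFmaddPs is a hypothetical helper; note that a plain Go multiply-add rounds twice, while the hardware FMA rounds once):

    func maskzFmaddPs(k uint8, a, b, c [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j]*b[j] + c[j]
            }
            // else: dst[j] stays 0, the "zeroed out" case
        }
        return
    }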
func MaskzFmaddRoundSd ¶
func MaskzFmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskzFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_round_sd'. Requires AVX512F.
func MaskzFmaddRoundSs ¶
func MaskzFmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskzFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_round_ss'. Requires AVX512F.
func MaskzFmaddSd ¶
func MaskzFmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_sd'. Requires AVX512F.
func MaskzFmaddSs ¶
func MaskzFmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_ss'. Requires AVX512F.
func MaskzFmaddsubPd ¶
func MaskzFmaddsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_maskz_fmaddsub_pd'. Requires AVX512F.
func MaskzFmaddsubPs ¶
func MaskzFmaddsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_maskz_fmaddsub_ps'. Requires AVX512F.
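The "alternatively add and subtract" wording resolves by lane index: even lanes subtract 'c' and odd lanes add it (fmsubadd below swaps the two roles). A sketch under the same conventions as above (hypothetical helper, unfused arithmetic):

    func maskzFmaddsubPs(k uint8, a, b, c [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) == 0 {
                continue // zeromask: lane stays 0
            }
            if j%2 == 0 {
                dst[j] = a[j]*b[j] - c[j] // even lane: subtract
            } else {
                dst[j] = a[j]*b[j] + c[j] // odd lane: add
            }
        }
        return
    }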
func MaskzFmsubPd ¶
func MaskzFmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_maskz_fmsub_pd'. Requires AVX512F.
func MaskzFmsubPs ¶
func MaskzFmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_maskz_fmsub_ps'. Requires AVX512F.
func MaskzFmsubRoundSd ¶
func MaskzFmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskzFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_round_sd'. Requires AVX512F.
func MaskzFmsubRoundSs ¶
func MaskzFmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskzFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_round_ss'. Requires AVX512F.
func MaskzFmsubSd ¶
func MaskzFmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_sd'. Requires AVX512F.
func MaskzFmsubSs ¶
func MaskzFmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_ss'. Requires AVX512F.
func MaskzFmsubaddPd ¶
func MaskzFmsubaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF (j is even) dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_maskz_fmsubadd_pd'. Requires AVX512F.
func MaskzFmsubaddPs ¶
func MaskzFmsubaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF (j is even) dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_maskz_fmsubadd_ps'. Requires AVX512F.
func MaskzFnmaddPd ¶
func MaskzFnmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_maskz_fnmadd_pd'. Requires AVX512F.
func MaskzFnmaddPs ¶
func MaskzFnmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_maskz_fnmadd_ps'. Requires AVX512F.
func MaskzFnmaddRoundSd ¶
func MaskzFnmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskzFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_maskz_fnmadd_round_sd'. Requires AVX512F.
func MaskzFnmaddRoundSs ¶
func MaskzFnmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskzFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_round_ss'. Requires AVX512F.
func MaskzFnmaddSd ¶
func MaskzFnmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_maskz_fnmadd_sd'. Requires AVX512F.
func MaskzFnmaddSs ¶
func MaskzFnmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_ss'. Requires AVX512F.
func MaskzFnmsubPd ¶
func MaskzFnmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_maskz_fnmsub_pd'. Requires AVX512F.
func MaskzFnmsubPs ¶
func MaskzFnmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_maskz_fnmsub_ps'. Requires AVX512F.
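Taken together, the four FMA families differ only in where the negations land. Per lane:

    fmadd:   (a * b) + c
    fmsub:   (a * b) - c
    fnmadd: -(a * b) + c
    fnmsub: -(a * b) - c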
func MaskzFnmsubRoundSd ¶
func MaskzFnmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)
MaskzFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_round_sd'. Requires AVX512F.
func MaskzFnmsubRoundSs ¶
func MaskzFnmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)
MaskzFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_round_ss'. Requires AVX512F.
func MaskzFnmsubSd ¶
func MaskzFnmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)
MaskzFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_sd'. Requires AVX512F.
func MaskzFnmsubSs ¶
func MaskzFnmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)
MaskzFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_ss'. Requires AVX512F.
func MaskzGetexpPd ¶
func MaskzGetexpPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ConvertExpFP64(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETEXPPD'. Intrinsic: '_mm_maskz_getexp_pd'. Requires AVX512F.
func MaskzGetexpPs ¶
func MaskzGetexpPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ConvertExpFP32(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETEXPPS'. Intrinsic: '_mm_maskz_getexp_ps'. Requires AVX512F.
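For finite inputs the standard library already provides this calculation: math.Logb returns the unbiased binary exponent floor(log2(|x|)) as a floating-point value, which is what the pseudocode calls ConvertExpFP32. A sketch assuming the "math" import (maskzGetexpPs is a hypothetical helper; denormal handling and the exact special cases for zero, infinity, and NaN are left to the hardware tables):

    func maskzGetexpPs(k uint8, a [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = float32(math.Logb(float64(a[j])))
            }
        }
        return
    }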
func MaskzGetexpRoundSd ¶
func MaskzGetexpRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)
MaskzGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_round_sd'. Requires AVX512F.
func MaskzGetexpRoundSs ¶
func MaskzGetexpRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)
MaskzGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_round_ss'. Requires AVX512F.
func MaskzGetexpSd ¶
func MaskzGetexpSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
MaskzGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
IF k[0] dst[63:0] := ConvertExpFP64(b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_sd'. Requires AVX512F.
func MaskzGetexpSs ¶
func MaskzGetexpSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
MaskzGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.
IF k[0] dst[31:0] := ConvertExpFP32(b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_ss'. Requires AVX512F.
func MaskzGetmantPd ¶
func MaskzGetmantPd(k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETMANTPD'. Intrinsic: '_mm_maskz_getmant_pd'. Requires AVX512F.
func MaskzGetmantPs ¶
func MaskzGetmantPs(k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VGETMANTPS'. Intrinsic: '_mm_maskz_getmant_ps'. Requires AVX512F.
func MaskzGetmantRoundSd ¶
func MaskzGetmantRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)
MaskzGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_round_sd'. Requires AVX512F.
func MaskzGetmantRoundSs ¶
func MaskzGetmantRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)
MaskzGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_round_ss'. Requires AVX512F.
func MaskzGetmantSd ¶
func MaskzGetmantSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)
MaskzGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

IF k[0] dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_sd'. Requires AVX512F.
func MaskzGetmantSs ¶
func MaskzGetmantSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)
MaskzGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.
The mantissa is normalized to the interval specified by 'interv', which can
take the following values:
_MM_MANT_NORM_1_2     // interval [1, 2)
_MM_MANT_NORM_p5_2    // interval [0.5, 2)
_MM_MANT_NORM_p5_1    // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

_MM_MANT_SIGN_src  // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan  // dst = NaN if sign(src) = 1

IF k[0] dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_ss'. Requires AVX512F.
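For the common _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_zero combination, the normalization can be reproduced with math.Frexp, which factors x into f * 2^e with |f| in [0.5, 1), so doubling |f| lands in [1, 2). A sketch for finite nonzero inputs, assuming the "math" import (getmant12 is a hypothetical helper; zeros, infinities, and NaNs follow special-case tables not shown here):

    func getmant12(x float64) float64 {
        f, _ := math.Frexp(x) // x = f * 2^e, |f| in [0.5, 1)
        return math.Abs(f) * 2
    }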
func MaskzLoadSd ¶
MaskzLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.
IF k[0] dst[63:0] := MEM[mem_addr+63:mem_addr] ELSE dst[63:0] := 0 FI dst[MAX:64] := 0
Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_load_sd'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func MaskzLoadSs ¶
MaskzLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.
IF k[0] dst[31:0] := MEM[mem_addr+31:mem_addr] ELSE dst[31:0] := 0 FI dst[MAX:32] := 0
Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_load_ss'. Requires AVX512F.
FIXME: Will likely need to be reworked (has pointer parameter).
func MaskzMaxEpi32 ¶
func MaskzMaxEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSD'. Intrinsic: '_mm_maskz_max_epi32'. Requires AVX512F.
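The integer min/max variants that follow all share this compare-and-select shape; in plain Go terms (maskzMaxEpi32 is a hypothetical helper):

    func maskzMaxEpi32(k uint8, a, b [4]int32) (dst [4]int32) {
        for j := 0; j < 4; j++ {
            if k&(1<<uint(j)) != 0 {
                if a[j] > b[j] {
                    dst[j] = a[j]
                } else {
                    dst[j] = b[j]
                }
            }
        }
        return
    }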
func MaskzMaxEpi64 ¶
func MaskzMaxEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm_maskz_max_epi64'. Requires AVX512F.
func MaskzMaxEpu32 ¶
func MaskzMaxEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] > b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUD'. Intrinsic: '_mm_maskz_max_epu32'. Requires AVX512F.
func MaskzMaxEpu64 ¶
func MaskzMaxEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm_maskz_max_epu64'. Requires AVX512F.
func MaskzMaxPd ¶
func MaskzMaxPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMAXPD'. Intrinsic: '_mm_maskz_max_pd'. Requires AVX512F.
func MaskzMaxPs ¶
func MaskzMaxPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMAXPS'. Intrinsic: '_mm_maskz_max_ps'. Requires AVX512F.
func MaskzMaxRoundSd ¶
func MaskzMaxRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
MaskzMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_round_sd'. Requires AVX512F.
func MaskzMaxRoundSs ¶
func MaskzMaxRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
MaskzMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_round_ss'. Requires AVX512F.
func MaskzMaxSd ¶
func MaskzMaxSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
MaskzMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := MAX(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_sd'. Requires AVX512F.
func MaskzMaxSs ¶
func MaskzMaxSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
MaskzMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := MAX(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_ss'. Requires AVX512F.
func MaskzMinEpi32 ¶
func MaskzMinEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSD'. Intrinsic: '_mm_maskz_min_epi32'. Requires AVX512F.
func MaskzMinEpi64 ¶
func MaskzMinEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm_maskz_min_epi64'. Requires AVX512F.
func MaskzMinEpu32 ¶
func MaskzMinEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF a[i+31:i] < b[i+31:i] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := b[i+31:i] FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUD'. Intrinsic: '_mm_maskz_min_epu32'. Requires AVX512F.
func MaskzMinEpu64 ¶
func MaskzMinEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm_maskz_min_epu64'. Requires AVX512F.
func MaskzMinPd ¶
func MaskzMinPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMINPD'. Intrinsic: '_mm_maskz_min_pd'. Requires AVX512F.
func MaskzMinPs ¶
func MaskzMinPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMINPS'. Intrinsic: '_mm_maskz_min_ps'. Requires AVX512F.
func MaskzMinRoundSd ¶
func MaskzMinRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)
MaskzMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_round_sd'. Requires AVX512F.
func MaskzMinRoundSs ¶
func MaskzMinRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)
MaskzMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_round_ss'. Requires AVX512F.
func MaskzMinSd ¶
func MaskzMinSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)
MaskzMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := MIN(a[63:0], b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_sd'. Requires AVX512F.
func MaskzMinSs ¶
func MaskzMinSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)
MaskzMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := MIN(a[31:0], b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_ss'. Requires AVX512F.
func MaskzMovEpi32 ¶
func MaskzMovEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQA32'. Intrinsic: '_mm_maskz_mov_epi32'. Requires AVX512F.
func MaskzMovEpi64 ¶
func MaskzMovEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQA64'. Intrinsic: '_mm_maskz_mov_epi64'. Requires AVX512F.
func MaskzMovPd ¶
func MaskzMovPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)
MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVAPD'. Intrinsic: '_mm_maskz_mov_pd'. Requires AVX512F.
func MaskzMovPs ¶
func MaskzMovPs(k x86.Mmask8, a x86.M128) (dst x86.M128)
MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVAPS'. Intrinsic: '_mm_maskz_mov_ps'. Requires AVX512F.
func MaskzMoveSd ¶
MaskzMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_move_sd'. Requires AVX512F.
func MaskzMoveSs ¶
MaskzMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_move_ss'. Requires AVX512F.
func MaskzMovedupPd ¶
MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[63:0] := a[63:0] tmp[127:64] := a[63:0] FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDDUP'. Intrinsic: '_mm_maskz_movedup_pd'. Requires AVX512F.
func MaskzMovehdupPs ¶
MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[63:32] tmp[63:32] := a[63:32] tmp[95:64] := a[127:96] tmp[127:96] := a[127:96] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_maskz_movehdup_ps'. Requires AVX512F.
func MaskzMoveldupPs ¶
MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp[31:0] := a[31:0] tmp[63:32] := a[31:0] tmp[95:64] := a[95:64] tmp[127:96] := a[95:64] FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_maskz_moveldup_ps'. Requires AVX512F.
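A hedged plain-Go sketch of the duplication step in '_mm_maskz_moveldup_ps' above (the helper is hypothetical; a [4]float32 value stands in for an __m128):

    // maskzMoveldupPs models VMOVSLDUP with a zeromask: even-indexed
    // elements of a are duplicated into each even/odd pair, then the
    // zeromask is applied per element.
    func maskzMoveldupPs(k uint8, a [4]float32) (dst [4]float32) {
        tmp := [4]float32{a[0], a[0], a[2], a[2]}
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }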
func MaskzMulEpi32 ¶
MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULDQ'. Intrinsic: '_mm_maskz_mul_epi32'. Requires AVX512F.
func MaskzMulEpu32 ¶
MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULUDQ'. Intrinsic: '_mm_maskz_mul_epu32'. Requires AVX512F.
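As a sketch of what VPMULUDQ computes per the pseudocode above, the following plain-Go model (hypothetical helper, two uint64 lanes standing in for an __m128i) multiplies only the low 32 bits of each 64-bit lane and keeps the full 64-bit product:

    // maskzMulEpu32 models _mm_maskz_mul_epu32: for each 64-bit lane,
    // multiply the low 32 bits of a and b as unsigned integers, or zero
    // the lane if its mask bit is clear.
    func maskzMulEpu32(k uint8, a, b [2]uint64) (dst [2]uint64) {
        for j := 0; j < 2; j++ {
            if k&(1<<j) != 0 {
                dst[j] = uint64(uint32(a[j])) * uint64(uint32(b[j]))
            }
        }
        return
    }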
func MaskzMulPd ¶
MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] * b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMULPD'. Intrinsic: '_mm_maskz_mul_pd'. Requires AVX512F.
func MaskzMulPs ¶
MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] * b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMULPS'. Intrinsic: '_mm_maskz_mul_ps'. Requires AVX512F.
func MaskzMulRoundSd ¶
MaskzMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_round_sd'. Requires AVX512F.
func MaskzMulRoundSs ¶
MaskzMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_round_ss'. Requires AVX512F.
func MaskzMulSd ¶
MaskzMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] * b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_sd'. Requires AVX512F.
func MaskzMulSs ¶
MaskzMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] * b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_ss'. Requires AVX512F.
func MaskzMulloEpi32 ¶
MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] tmp[63:0] := a[i+31:i] * b[i+31:i] dst[i+31:i] := tmp[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULLD'. Intrinsic: '_mm_maskz_mullo_epi32'. Requires AVX512F.
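A minimal plain-Go sketch of the low-half multiply above (helper name illustrative). Go's fixed-width integer multiplication wraps, which matches keeping tmp[31:0] of the 64-bit product:

    // maskzMulloEpi32 models _mm_maskz_mullo_epi32: int32 multiplication in
    // Go already truncates to the low 32 bits of the intermediate product.
    func maskzMulloEpi32(k uint8, a, b [4]int32) (dst [4]int32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[j] * b[j]
            }
        }
        return
    }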
func MaskzOrEpi32 ¶
MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] OR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPORD'. Intrinsic: '_mm_maskz_or_epi32'. Requires AVX512F.
func MaskzOrEpi64 ¶
MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] OR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPORQ'. Intrinsic: '_mm_maskz_or_epi64'. Requires AVX512F.
func MaskzPermutePd ¶
MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permute_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
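A plain-Go sketch of the imm8-driven lane selection above (hypothetical helper; [2]float64 stands in for an __m128d):

    // maskzPermutePd models _mm_maskz_permute_pd: bit 0 of imm8 selects the
    // source lane for dst[0] and bit 1 selects the lane for dst[1], before
    // the zeromask is applied.
    func maskzPermutePd(k uint8, a [2]float64, imm8 byte) (dst [2]float64) {
        tmp := [2]float64{a[imm8&1], a[(imm8>>1)&1]}
        for j := 0; j < 2; j++ {
            if k&(1<<j) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }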
func MaskzPermutePs ¶
MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permute_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzPermutevarPd ¶
MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permutevar_pd'. Requires AVX512F.
func MaskzPermutevarPs ¶
MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permutevar_ps'. Requires AVX512F.
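The SELECT4 helper above reads two control bits per destination lane. A plain-Go sketch of '_mm_maskz_permutevar_ps' (hypothetical helper; [4]float32 and [4]uint32 stand in for the vector types):

    // maskzPermutevarPs models VPERMILPS with per-lane controls: bits [1:0]
    // of each 32-bit lane of b select which lane of a feeds that position.
    func maskzPermutevarPs(k uint8, a [4]float32, b [4]uint32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[b[j]&3] // SELECT4(a, b[j][1:0])
            }
        }
        return
    }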
func MaskzPermutex2varEpi32 ¶
MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_maskz_permutex2var_epi32'. Requires AVX512F.
func MaskzPermutex2varEpi64 ¶
MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_maskz_permutex2var_epi64'. Requires AVX512F.
func MaskzPermutex2varPd ¶
MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 off := idx[i]*64 IF k[j] dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_maskz_permutex2var_pd'. Requires AVX512F.
func MaskzPermutex2varPs ¶
MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 IF k[j] dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_maskz_permutex2var_ps'. Requires AVX512F.
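The four two-source permutes above share one indexing scheme: the low bits of each idx lane pick an element, and the next bit selects between the two sources. A plain-Go sketch of the 32-bit integer variant (hypothetical helper):

    // maskzPermutex2varEpi32 models _mm_maskz_permutex2var_epi32: bits [1:0]
    // of each idx lane choose an element offset, bit 2 chooses a (0) or b (1).
    func maskzPermutex2varEpi32(k uint8, a, idx, b [4]uint32) (dst [4]uint32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) == 0 {
                continue // lane is zeroed
            }
            off := idx[j] & 3
            if idx[j]&4 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return
    }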
func MaskzRcp14Pd ¶
MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm_maskz_rcp14_pd'. Requires AVX512F.
func MaskzRcp14Ps ¶
MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm_maskz_rcp14_ps'. Requires AVX512F.
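A hedged plain-Go model of the masked reciprocal above, with an exact division standing in for the hardware approximation (whose relative error is merely guaranteed to be below 2^-14); the helper is illustrative only:

    // maskzRcp14Ps models _mm_maskz_rcp14_ps, substituting exact division
    // for the approximate VRCP14PS result.
    func maskzRcp14Ps(k uint8, a [4]float32) (dst [4]float32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = 1 / a[j]
            }
        }
        return
    }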
func MaskzRcp14Sd ¶
MaskzRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[63:0] := APPROXIMATE(1.0/b[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRCP14SD'. Intrinsic: '_mm_maskz_rcp14_sd'. Requires AVX512F.
func MaskzRcp14Ss ¶
MaskzRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[31:0] := APPROXIMATE(1.0/b[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRCP14SS'. Intrinsic: '_mm_maskz_rcp14_ss'. Requires AVX512F.
func MaskzRolEpi32 ¶
MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm_maskz_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRolEpi64 ¶
MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm_maskz_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRolvEpi32 ¶
MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm_maskz_rolv_epi32'. Requires AVX512F.
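The standard library already implements the LEFT_ROTATE_DWORDS helper above. A plain-Go sketch of '_mm_maskz_rolv_epi32' (hypothetical helper; assumes `import "math/bits"`):

    // maskzRolvEpi32 models VPROLVD: bits.RotateLeft32 performs the same
    // modulo-32 left rotation as LEFT_ROTATE_DWORDS, per lane.
    func maskzRolvEpi32(k uint8, a, b [4]uint32) (dst [4]uint32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
            }
        }
        return
    }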
func MaskzRolvEpi64 ¶
MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
LEFT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm_maskz_rolv_epi64'. Requires AVX512F.
func MaskzRorEpi32 ¶
MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm_maskz_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRorEpi64 ¶
MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm_maskz_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRorvEpi32 ¶
MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_DWORDS(src, count_src){
    count := count_src modulo 32
    RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm_maskz_rorv_epi32'. Requires AVX512F.
func MaskzRorvEpi64 ¶
MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RIGHT_ROTATE_QWORDS(src, count_src){
    count := count_src modulo 64
    RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm_maskz_rorv_epi64'. Requires AVX512F.
func MaskzRoundscalePd ¶
MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_maskz_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRoundscalePs ¶
MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_maskz_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
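A hedged plain-Go sketch of the RoundTo_Integer helper above, covering only the static-rounding path (imm8 bit 2 clear) and omitting the precision-exception signalling; it uses float64 for simplicity even though the packed op is single-precision, and assumes `import "math"`:

    // roundscale rounds x to M = imm8[7:4] fraction bits using the rounding
    // direction in imm8[1:0], by scaling up by 2^M, rounding to an integer,
    // and scaling back down.
    func roundscale(x float64, imm8 byte) float64 {
        m := float64(int(1) << (imm8 >> 4)) // 2^M
        scaled := x * m
        var tmp float64
        switch imm8 & 3 { // imm8[1:0] selects the rounding direction
        case 0:
            tmp = math.RoundToEven(scaled)
        case 1:
            tmp = math.Floor(scaled)
        case 2:
            tmp = math.Ceil(scaled)
        case 3:
            tmp = math.Trunc(scaled)
        }
        return tmp / m // scale back down by 2^-M
    }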
func MaskzRoundscaleRoundSd ¶
func MaskzRoundscaleRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)
MaskzRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRoundscaleRoundSs ¶
func MaskzRoundscaleRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)
MaskzRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRoundscaleSd ¶
MaskzRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
        1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
        2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
        3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
    ESAC
    dst[63:0] := 2^-M * tmp[63:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[63:0] != dst[63:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRoundscaleSs ¶
MaskzRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
    IF (imm8[2] == 1)
        rounding_direction := MXCSR.RC // use the rounding mode specified by MXCSR.RC
    ELSE
        rounding_direction := imm8[1:0] // use the rounding mode specified by imm8[1:0]
    FI
    M := imm8[7:4] // the scaling factor (number of fraction bits to round to)
    CASE(rounding_direction)
        0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
        1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
        2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
        3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
    ESAC
    dst[31:0] := 2^-M * tmp[31:0] // scale back down
    IF imm8[3] == 0 // check SPE
        IF src[31:0] != dst[31:0] // check if precision has been lost
            set_precision() // set #PE
        FI
    FI
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzRsqrt14Pd ¶
MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i])) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_maskz_rsqrt14_pd'. Requires AVX512F.
func MaskzRsqrt14Ps ¶
MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i])) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_maskz_rsqrt14_ps'. Requires AVX512F.
func MaskzRsqrt14Sd ¶
MaskzRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_maskz_rsqrt14_sd'. Requires AVX512F.
func MaskzRsqrt14Ss ¶
MaskzRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
IF k[0] dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_maskz_rsqrt14_ss'. Requires AVX512F.
func MaskzScalefPd ¶
MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm_maskz_scalef_pd'. Requires AVX512F.
func MaskzScalefPs ¶
MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm_maskz_scalef_ps'. Requires AVX512F.
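For finite inputs, the SCALE helper above reduces to a * 2^FLOOR(b). A hedged one-line plain-Go model (hypothetical helper; the NaN and denormal branches are deliberately omitted, and it assumes `import "math"` and a b value small enough to convert to int):

    // scalef models SCALE for ordinary finite inputs: multiply a by two
    // raised to the floor of b, via math.Ldexp.
    func scalef(a, b float64) float64 {
        return math.Ldexp(a, int(math.Floor(b)))
    }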
func MaskzScalefRoundSd ¶
MaskzScalefRoundSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_round_sd'. Requires AVX512F.
func MaskzScalefRoundSs ¶
MaskzScalefRoundSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_round_ss'. Requires AVX512F.
func MaskzScalefSd ¶
MaskzScalefSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
    RETURN dst[63:0]
}
IF k[0]
    dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
    dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_sd'. Requires AVX512F.
func MaskzScalefSs ¶
MaskzScalefSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
SCALE(src1, src2){
    IF (src2 == NaN)
        IF (src2 == SNaN)
            RETURN QNAN(src2)
        FI
    ELSE IF (src1 == NaN)
        IF (src1 == SNaN)
            RETURN QNAN(src1)
        FI
        IF (src2 != INF)
            RETURN QNAN(src1)
        FI
    ELSE
        tmp_src2 := src2
        tmp_src1 := src1
        IF (src2 is denormal AND MXCSR.DAZ)
            tmp_src2 := 0
        FI
        IF (src1 is denormal AND MXCSR.DAZ)
            tmp_src1 := 0
        FI
    FI
    dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
    RETURN dst[31:0]
}
IF k[0]
    dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
    dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_ss'. Requires AVX512F.
func MaskzSet1Epi32 ¶
MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[31:0] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_set1_epi32'. Requires AVX512F.
func MaskzSet1Epi64 ¶
MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[63:0] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_set1_epi64'. Requires AVX512F.
func MaskzShuffleEpi32 ¶
MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPSHUFD'. Intrinsic: '_mm_maskz_shuffle_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzShufflePd ¶
MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
FOR j := 0 to 1
    i := j*64
    IF k[j]
        dst[i+63:i] := tmp_dst[i+63:i]
    ELSE
        dst[i+63:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSHUFPD'. Intrinsic: '_mm_maskz_shuffle_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzShufflePs ¶
MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
SELECT4(src, control){
    CASE(control[1:0])
        0: tmp[31:0] := src[31:0]
        1: tmp[31:0] := src[63:32]
        2: tmp[31:0] := src[95:64]
        3: tmp[31:0] := src[127:96]
    ESAC
    RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
FOR j := 0 to 3
    i := j*32
    IF k[j]
        dst[i+31:i] := tmp_dst[i+31:i]
    ELSE
        dst[i+31:i] := 0
    FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSHUFPS'. Intrinsic: '_mm_maskz_shuffle_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
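A plain-Go sketch of the shuffle above (hypothetical helper; [4]float32 stands in for an __m128). The low two results come from 'a' and the high two from 'b', each chosen by a 2-bit field of imm8:

    // maskzShufflePs models _mm_maskz_shuffle_ps: each 2-bit field of imm8
    // is a SELECT4 control, applied to a for lanes 0-1 and b for lanes 2-3.
    func maskzShufflePs(k uint8, a, b [4]float32, imm8 byte) (dst [4]float32) {
        tmp := [4]float32{
            a[imm8&3],
            a[(imm8>>2)&3],
            b[(imm8>>4)&3],
            b[(imm8>>6)&3],
        }
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }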
func MaskzSllEpi32 ¶
MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_sll_epi32'. Requires AVX512F.
func MaskzSllEpi64 ¶
MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_sll_epi64'. Requires AVX512F.
func MaskzSlliEpi32 ¶
MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_slli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzSlliEpi64 ¶
MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_slli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzSllvEpi32 ¶
MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVD'. Intrinsic: '_mm_maskz_sllv_epi32'. Requires AVX512F.
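A hedged plain-Go model of the variable shift above (hypothetical helper). Note that per-lane counts of 32 or more clear the lane on real hardware, so the model guards the shift explicitly rather than relying on Go's shift semantics:

    // maskzSllvEpi32 models _mm_maskz_sllv_epi32: each lane of a is shifted
    // left by the corresponding lane of count; oversized counts give zero.
    func maskzSllvEpi32(k uint8, a, count [4]uint32) (dst [4]uint32) {
        for j := 0; j < 4; j++ {
            if k&(1<<j) != 0 && count[j] < 32 {
                dst[j] = a[j] << count[j]
            }
        }
        return
    }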
func MaskzSllvEpi64 ¶
MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVQ'. Intrinsic: '_mm_maskz_sllv_epi64'. Requires AVX512F.
func MaskzSqrtPd ¶
MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SQRT(a[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSQRTPD'. Intrinsic: '_mm_maskz_sqrt_pd'. Requires AVX512F.
func MaskzSqrtPs ¶
MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SQRT(a[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSQRTPS'. Intrinsic: '_mm_maskz_sqrt_ps'. Requires AVX512F.
func MaskzSqrtRoundSd ¶
MaskzSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_round_sd'. Requires AVX512F.
func MaskzSqrtRoundSs ¶
MaskzSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
    (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_round_ss'. Requires AVX512F.
func MaskzSqrtSd ¶
MaskzSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.
IF k[0] dst[63:0] := SQRT(a[63:0]) ELSE dst[63:0] := 0 FI dst[127:64] := b[127:64] dst[MAX:128] := 0
Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_sd'. Requires AVX512F.
func MaskzSqrtSs ¶
MaskzSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
IF k[0] dst[31:0] := SQRT(a[31:0]) ELSE dst[31:0] := 0 FI dst[127:32] := b[127:32] dst[MAX:128] := 0
Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_ss'. Requires AVX512F.
func MaskzSraEpi32 ¶
MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_sra_epi32'. Requires AVX512F.
func MaskzSraEpi64 ¶
MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_sra_epi64'. Requires AVX512F.
func MaskzSraiEpi32 ¶
MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := SignBit ELSE dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_srai_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzSraiEpi64 ¶
MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
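A plain-Go sketch of the arithmetic shift above (hypothetical helper; [2]int64 stands in for an __m128i). Clamping the count to 63 reproduces the SignBit case, because shifting a signed value right by 63 replicates its sign bit across the lane:

    // maskzSraiEpi64 models _mm_maskz_srai_epi64: shifts of 64 or more fill
    // the lane with copies of the sign bit (0 or -1).
    func maskzSraiEpi64(k uint8, a [2]int64, imm8 byte) (dst [2]int64) {
        shift := uint(imm8)
        if shift > 63 {
            shift = 63 // a >> 63 yields 0 or -1, i.e. the sign bit replicated
        }
        for j := 0; j < 2; j++ {
            if k&(1<<j) != 0 {
                dst[j] = a[j] >> shift
            }
        }
        return
    }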
func MaskzSravEpi32 ¶
MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVD'. Intrinsic: '_mm_maskz_srav_epi32'. Requires AVX512F.
func MaskzSravEpi64 ¶
MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm_maskz_srav_epi64'. Requires AVX512F.
func MaskzSrlEpi32 ¶
MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF count[63:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srl_epi32'. Requires AVX512F.
func MaskzSrlEpi64 ¶
MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF count[63:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srl_epi64'. Requires AVX512F.
func MaskzSrliEpi32 ¶
MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] IF imm8[7:0] > 31 dst[i+31:i] := 0 ELSE dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0]) FI ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srli_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzSrliEpi64 ¶
MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] IF imm8[7:0] > 63 dst[i+63:i] := 0 ELSE dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0]) FI ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srli_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzSrlvEpi32 ¶
MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i]) ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVD'. Intrinsic: '_mm_maskz_srlv_epi32'. Requires AVX512F.
func MaskzSrlvEpi64 ¶
MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i]) ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVQ'. Intrinsic: '_mm_maskz_srlv_epi64'. Requires AVX512F.
func MaskzSubEpi32 ¶
MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBD'. Intrinsic: '_mm_maskz_sub_epi32'. Requires AVX512F.
func MaskzSubEpi64 ¶
MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBQ'. Intrinsic: '_mm_maskz_sub_epi64'. Requires AVX512F.
func MaskzSubPd ¶
MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] - b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSUBPD'. Intrinsic: '_mm_maskz_sub_pd'. Requires AVX512F.
func MaskzSubPs ¶
MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] - b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VSUBPS'. Intrinsic: '_mm_maskz_sub_ps'. Requires AVX512F.
func MaskzSubRoundSd ¶
MaskzSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[63:0] := a[63:0] - b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_round_sd'. Requires AVX512F.
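The 'rounding' argument used throughout the *Round* functions is a bit-field. The values below are the standard _MM_FROUND_* constants from C's smmintrin.h, reproduced only for reference; this package is not stated to define its own equivalents.

	// Standard _MM_FROUND_* values (from smmintrin.h), for reference only.
	const (
		mmFroundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		mmFroundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		mmFroundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		mmFroundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		mmFroundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
		mmFroundNoExc        = 0x08 // _MM_FROUND_NO_EXC
	)

	// Example: truncate, and suppress exceptions.
	// rounding := mmFroundToZero | mmFroundNoExc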
func MaskzSubRoundSs ¶
MaskzSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

IF k[0]
	dst[31:0] := a[31:0] - b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_round_ss'. Requires AVX512F.
func MaskzSubSd ¶
MaskzSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.
IF k[0] dst[63:0] := a[63:0] - b[63:0] ELSE dst[63:0] := 0 FI dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_sd'. Requires AVX512F.
func MaskzSubSs ¶
MaskzSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
IF k[0] dst[31:0] := a[31:0] - b[31:0] ELSE dst[31:0] := 0 FI dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_ss'. Requires AVX512F.
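For the scalar maskz forms only mask bit 0 participates, and the upper lanes always come from 'a'. A rough Go model (the function name is invented):

	// maskzSubSs is a scalar model of _mm_maskz_sub_ss.
	func maskzSubSs(k uint8, a, b [4]float32) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = a[0] - b[0]
		} // otherwise dst[0] stays 0
		dst[1], dst[2], dst[3] = a[1], a[2], a[3] // upper lanes copied from 'a'
		return
	}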
func MaskzTernarylogicEpi32 ¶
func MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_maskz_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func MaskzTernarylogicEpi64 ¶
func MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)
MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_maskz_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
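The 'imm8' of the ternarylogic family is an 8-entry truth table indexed by the bit-triple taken from 'a', 'b', and 'c'. A hedged Go sketch of the unmasked 32-bit core (the helper name is made up):

	// ternarylogic32 models one 32-bit lane of VPTERNLOGD: at every bit
	// position the bits of a, b, and c form a 3-bit index, and the bit of
	// imm8 at that index becomes the result bit. For example, imm8 = 0xE8
	// is the majority function (set when at least two of a, b, c are set).
	func ternarylogic32(a, b, c uint32, imm8 uint8) (dst uint32) {
		for h := uint(0); h < 32; h++ {
			idx := (a>>h&1)<<2 | (b>>h&1)<<1 | (c >> h & 1)
			dst |= uint32(imm8>>idx&1) << h
		}
		return
	}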
func MaskzUnpackhiEpi32 ¶
MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_maskz_unpackhi_epi32'. Requires AVX512F.
func MaskzUnpackhiEpi64 ¶
MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_maskz_unpackhi_epi64'. Requires AVX512F.
func MaskzUnpackhiPd ¶
MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[127:64] dst[127:64] := src2[127:64] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_maskz_unpackhi_pd'. Requires AVX512F.
func MaskzUnpackhiPs ¶
MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[95:64] dst[63:32] := src2[95:64] dst[95:64] := src1[127:96] dst[127:96] := src2[127:96] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_maskz_unpackhi_ps'. Requires AVX512F.
func MaskzUnpackloEpi32 ¶
MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_maskz_unpacklo_epi32'. Requires AVX512F.
func MaskzUnpackloEpi64 ¶
MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_maskz_unpacklo_epi64'. Requires AVX512F.
func MaskzUnpackloPd ¶
MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){ dst[63:0] := src1[63:0] dst[127:64] := src2[63:0] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := tmp_dst[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_maskz_unpacklo_pd'. Requires AVX512F.
func MaskzUnpackloPs ¶
MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){ dst[31:0] := src1[31:0] dst[63:32] := src2[31:0] dst[95:64] := src1[63:32] dst[127:96] := src2[63:32] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := tmp_dst[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_maskz_unpacklo_ps'. Requires AVX512F.
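The unpack family is built from the INTERLEAVE helpers in the pseudocode above; in Go the dword cases are just fixed shuffles (sketch, names invented):

	// interleaveDwords models INTERLEAVE_DWORDS: the low two lanes of each
	// source alternate into the result, src1 filling the even lanes.
	func interleaveDwords(src1, src2 [4]uint32) [4]uint32 {
		return [4]uint32{src1[0], src2[0], src1[1], src2[1]}
	}

	// interleaveHighDwords is the unpackhi counterpart over the upper halves.
	func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
		return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
	}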
func MaskzXorEpi32 ¶
MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i] XOR b[i+31:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPXORD'. Intrinsic: '_mm_maskz_xor_epi32'. Requires AVX512F.
func MaskzXorEpi64 ¶
MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*64 IF k[j] dst[i+63:i] := a[i+63:i] XOR b[i+63:i] ELSE dst[i+63:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPXORQ'. Intrinsic: '_mm_maskz_xor_epi64'. Requires AVX512F.
func MaxEpi64 ¶
MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 1 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSQ'. Intrinsic: '_mm_max_epi64'. Requires AVX512F.
func MaxEpu64 ¶
MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 1 i := j*64 IF a[i+63:i] > b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUQ'. Intrinsic: '_mm_max_epu64'. Requires AVX512F.
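VPMAXSQ and VPMAXUQ run the same per-lane comparison and differ only in whether the 64-bit lanes are interpreted as signed or unsigned, which in a Go model is just the lane type (sketch):

	// maxEpi64 models _mm_max_epi64 with signed lanes; using uint64 instead
	// gives _mm_max_epu64. The same bit pattern can order differently:
	// 0xFFFFFFFFFFFFFFFF is -1 when signed but maximal when unsigned.
	func maxEpi64(a, b [2]int64) (dst [2]int64) {
		for j := range dst {
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		}
		return
	}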
func MaxRoundSd ¶
MaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

dst[63:0] := MAX(a[63:0], b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMAXSD'. Intrinsic: '_mm_max_round_sd'. Requires AVX512F.
func MaxRoundSs ¶
MaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMAXSS'. Intrinsic: '_mm_max_round_ss'. Requires AVX512F.
func MinEpi64 ¶
MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 1 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSQ'. Intrinsic: '_mm_min_epi64'. Requires AVX512F.
func MinEpu64 ¶
MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 1 i := j*64 IF a[i+63:i] < b[i+63:i] dst[i+63:i] := a[i+63:i] ELSE dst[i+63:i] := b[i+63:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUQ'. Intrinsic: '_mm_min_epu64'. Requires AVX512F.
func MinRoundSd ¶
MinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

dst[63:0] := MIN(a[63:0], b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMINSD'. Intrinsic: '_mm_min_round_sd'. Requires AVX512F.
func MinRoundSs ¶
MinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMINSS'. Intrinsic: '_mm_min_round_ss'. Requires AVX512F.
func MulRoundSd ¶
MulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := a[63:0] * b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VMULSD'. Intrinsic: '_mm_mul_round_sd'. Requires AVX512F.
func MulRoundSs ¶
MulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VMULSS'. Intrinsic: '_mm_mul_round_ss'. Requires AVX512F.
func Permutex2varEpi32 ¶
Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_permutex2var_epi32'. Requires AVX512F.
func Permutex2varEpi64 ¶
Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 1 i := j*64 off := idx[i]*64 dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_permutex2var_epi64'. Requires AVX512F.
func Permutex2varPd ¶
Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 1 i := j*64 off := idx[i]*64 dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_permutex2var_pd'. Requires AVX512F.
func Permutex2varPs ¶
Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 3 i := j*32 off := idx[i+1:i]*32 dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_permutex2var_ps'. Requires AVX512F.
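In the permutex2var forms each 'idx' lane carries a lane index in its low bits plus one selector bit choosing between 'a' and 'b'. A rough Go model of the 32-bit case (the function name is invented):

	// permutex2varEpi32 models _mm_permutex2var_epi32 on 128-bit vectors:
	// bits 1:0 of each idx lane pick a lane, bit 2 picks the source.
	func permutex2varEpi32(a, idx, b [4]uint32) (dst [4]uint32) {
		for j := range dst {
			off := idx[j] & 3 // lane within the chosen source
			if idx[j]&4 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}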
func Rcp14Pd ¶
Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 1 i := j*64 dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PD'. Intrinsic: '_mm_rcp14_pd'. Requires AVX512F.
func Rcp14Ps ¶
Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.
FOR j := 0 to 3 i := j*32 dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VRCP14PS'. Intrinsic: '_mm_rcp14_ps'. Requires AVX512F.
func Rcp14Sd ¶
Rcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
dst[63:0] := APPROXIMATE(1.0/b[63:0]) dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRCP14SD'. Intrinsic: '_mm_rcp14_sd'. Requires AVX512F.
func Rcp14Ss ¶
Rcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
dst[31:0] := APPROXIMATE(1.0/b[31:0]) dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRCP14SS'. Intrinsic: '_mm_rcp14_ss'. Requires AVX512F.
func RolEpi32 ¶
RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPROLD'. Intrinsic: '_mm_rol_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RolEpi64 ¶
RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPROLQ'. Intrinsic: '_mm_rol_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RolvEpi32 ¶
RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src << count) OR (src >> (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPROLVD'. Intrinsic: '_mm_rolv_epi32'. Requires AVX512F.
func RolvEpi64 ¶
RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
LEFT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src << count) OR (src >> (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPROLVQ'. Intrinsic: '_mm_rolv_epi64'. Requires AVX512F.
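Go's math/bits provides exactly the modulo-width rotate these entries describe, so a per-lane model is short (sketch; the function name is invented):

	import "math/bits"

	// rolvEpi32 models _mm_rolv_epi32: each lane of 'a' rotates left by the
	// corresponding lane of 'b', with the count taken modulo 32.
	func rolvEpi32(a, b [4]uint32) (dst [4]uint32) {
		for j := range dst {
			dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
		}
		return
	}

A negative count passed to bits.RotateLeft32 rotates right, so the ror*/rorv* variants are the same sketch with a negated count.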
func RorEpi32 ¶
RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPRORD'. Intrinsic: '_mm_ror_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RorEpi64 ¶
RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPRORQ'. Intrinsic: '_mm_ror_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RorvEpi32 ¶
RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_DWORDS(src, count_src){ count := count_src modulo 32 RETURN (src >> count) OR (src << (32 - count)) } FOR j := 0 to 3 i := j*32 dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPRORVD'. Intrinsic: '_mm_rorv_epi32'. Requires AVX512F.
func RorvEpi64 ¶
RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.
RIGHT_ROTATE_QWORDS(src, count_src){ count := count_src modulo 64 RETURN (src >> count) OR (src << (64 - count)) } FOR j := 0 to 1 i := j*64 dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPRORVQ'. Intrinsic: '_mm_rorv_epi64'. Requires AVX512F.
func RoundscalePd ¶
RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC
	dst[63:0] := 2^-M * tmp[63:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[63:0] != dst[63:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[63:0]
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_roundscale_pd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RoundscalePs ¶
RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC
	dst[31:0] := 2^-M * tmp[31:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[31:0] != dst[31:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[31:0]
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_roundscale_ps'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
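The RoundTo_Integer helpers round to M fraction bits by scaling up by 2^M, rounding to an integer, and scaling back down. A hedged Go version of the round-to-nearest-even case (rounding-direction dispatch and #PE signalling omitted):

	import "math"

	// roundscale rounds x to m fraction bits with round-to-nearest-even,
	// mirroring the 2^M scale / round / 2^-M rescale in the pseudocode.
	func roundscale(x float64, m uint) float64 {
		scaled := math.Ldexp(x, int(m)) // 2^M * x
		return math.Ldexp(math.RoundToEven(scaled), -int(m))
	}

For example, roundscale(1.23, 2) rounds to the nearest multiple of 0.25 and yields 1.25.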
func RoundscaleRoundSd ¶
RoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC
	dst[63:0] := 2^-M * tmp[63:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[63:0] != dst[63:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[63:0]
}
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_round_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RoundscaleRoundSs ¶
RoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC
	dst[31:0] := 2^-M * tmp[31:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[31:0] != dst[31:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[31:0]
}
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_round_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RoundscaleSd ¶
RoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC
	dst[63:0] := 2^-M * tmp[63:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[63:0] != dst[63:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[63:0]
}
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_sd'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func RoundscaleSs ¶
RoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC  // Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] // Use the rounding mode specified by imm8[1:0]
	FI
	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC
	dst[31:0] := 2^-M * tmp[31:0] // scale back down
	IF imm8[3] == 0 // check SPE
		IF src[31:0] != dst[31:0] // check if precision has been lost
			set_precision() // set #PE
		FI
	FI
	RETURN dst[31:0]
}
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_ss'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func Rsqrt14Sd ¶
Rsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.
dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0])) dst[127:64] := a[127:64] dst[MAX:128] := 0
Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_rsqrt14_sd'. Requires AVX512F.
func Rsqrt14Ss ¶
Rsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.
dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0])) dst[127:32] := a[127:32] dst[MAX:128] := 0
Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_rsqrt14_ss'. Requires AVX512F.
func ScalefPd ¶
ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPD'. Intrinsic: '_mm_scalef_pd'. Requires AVX512F.
func ScalefPs ¶
ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VSCALEFPS'. Intrinsic: '_mm_scalef_ps'. Requires AVX512F.
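Ignoring the NaN and DAZ special cases, SCALE reduces to multiplying the first operand by two raised to the floor of the second. A minimal Go sketch:

	import "math"

	// scalef models the SCALE helper's common path: src1 * 2^FLOOR(src2).
	// The NaN propagation and MXCSR.DAZ handling in the pseudocode are
	// deliberately omitted here.
	func scalef(src1, src2 float64) float64 {
		return math.Ldexp(src1, int(math.Floor(src2)))
	}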
func ScalefRoundSd ¶
ScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}
dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_round_sd'. Requires AVX512F.
func ScalefRoundSs ¶
ScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}
dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_round_ss'. Requires AVX512F.
func ScalefSd ¶
ScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}
dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_sd'. Requires AVX512F.
func ScalefSs ¶
ScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}
dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_ss'. Requires AVX512F.
func SqrtRoundSd ¶
SqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := SQRT(a[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
Instruction: 'VSQRTSD'. Intrinsic: '_mm_sqrt_round_sd'. Requires AVX512F.
func SqrtRoundSs ¶
SqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := SQRT(a[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
Instruction: 'VSQRTSS'. Intrinsic: '_mm_sqrt_round_ss'. Requires AVX512F.
func SraEpi64 ¶
SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 1 i := j*64 IF count[63:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0]) FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_sra_epi64'. Requires AVX512F.
func SraiEpi64 ¶
SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 1 i := j*64 IF imm8[7:0] > 63 dst[i+63:i] := SignBit ELSE dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0]) FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAQ'. Intrinsic: '_mm_srai_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func SravEpi64 ¶
SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 1 i := j*64 dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVQ'. Intrinsic: '_mm_srav_epi64'. Requires AVX512F.
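For the arithmetic right shifts, any count of 64 or more leaves only copies of the sign bit. A Go model of the per-lane variant (the name is invented; the clamp mirrors the pseudocode, though Go's signed >> saturates the same way for large counts):

	// sravEpi64 models _mm_srav_epi64: each signed lane shifts right by its
	// per-lane count; counts above 63 are clamped so the result is the
	// lane's sign extension (all zeros or all ones).
	func sravEpi64(a [2]int64, count [2]uint64) (dst [2]int64) {
		for j := range dst {
			c := count[j]
			if c > 63 {
				c = 63
			}
			dst[j] = a[j] >> c
		}
		return
	}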
func SubRoundSd ¶
SubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[63:0] := a[63:0] - b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Instruction: 'VSUBSD'. Intrinsic: '_mm_sub_round_sd'. Requires AVX512F.
func SubRoundSs ¶
SubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.
Rounding is done according to the 'rounding' parameter, which can be one
of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Instruction: 'VSUBSS'. Intrinsic: '_mm_sub_round_ss'. Requires AVX512F.
func TernarylogicEpi32 ¶
TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst'.
FOR j := 0 to 3 i := j*32 FOR h := 0 to 31 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_ternarylogic_epi32'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func TernarylogicEpi64 ¶
TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst'.
FOR j := 0 to 1 i := j*64 FOR h := 0 to 63 index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h] dst[i+h] := imm8[index[2:0]] ENDFOR ENDFOR dst[MAX:128] := 0
Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_ternarylogic_epi64'. Requires AVX512F.
FIXME: Requires compiler support (has immediate)
func TestEpi32Mask ¶
TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 3 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTMD'. Intrinsic: '_mm_test_epi32_mask'. Requires AVX512F.
func TestEpi64Mask ¶
TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 1 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 ENDFOR k[MAX:2] := 0
Instruction: 'VPTESTMQ'. Intrinsic: '_mm_test_epi64_mask'. Requires AVX512F.
func TestnEpi32Mask ¶
TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 3 i := j*32 k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 ENDFOR k[MAX:4] := 0
Instruction: 'VPTESTNMD'. Intrinsic: '_mm_testn_epi32_mask'. Requires AVX512F.
func TestnEpi64Mask ¶
TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 1 i := j*64 k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 ENDFOR k[MAX:2] := 0
Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_testn_epi64_mask'. Requires AVX512F.
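The test/testn pairs build an opmask from per-lane AND tests; a compact Go model of the 32-bit pair (function names invented):

	// testEpi32Mask sets mask bit j when a[j] AND b[j] is non-zero (VPTESTMD).
	func testEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := range a {
			if a[j]&b[j] != 0 {
				k |= 1 << uint(j)
			}
		}
		return
	}

	// testnEpi32Mask sets mask bit j when a[j] AND b[j] is zero (VPTESTNMD).
	func testnEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := range a {
			if a[j]&b[j] == 0 {
				k |= 1 << uint(j)
			}
		}
		return
	}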
Types ¶
This section is empty.