avx512f

package v0.0.0-...-3878f85
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AbsEpi64

func AbsEpi64(a x86.M128i) (dst x86.M128i)

AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_abs_epi64'. Requires AVX512F.
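
Since the package ships no working intrinsics (see the warning above), a plain-Go sketch can serve as a reference model for the pseudocode. Here [2]int64 stands in for x86.M128i, and absEpi64Ref is a name local to this sketch:

    package main

    import "fmt"

    // absEpi64Ref mirrors the AbsEpi64 pseudocode: the absolute value of
    // each 64-bit lane. Like VPABSQ, negating math.MinInt64 wraps back to
    // math.MinInt64 in two's complement.
    func absEpi64Ref(a [2]int64) (dst [2]int64) {
    	for j := 0; j <= 1; j++ {
    		if a[j] < 0 {
    			dst[j] = -a[j]
    		} else {
    			dst[j] = a[j]
    		}
    	}
    	return dst
    }

    func main() {
    	fmt.Println(absEpi64Ref([2]int64{-5, 7})) // [5 7]
    }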

func AddRoundSd

func AddRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

AddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] + b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_add_round_sd'. Requires AVX512F.

func AddRoundSs

func AddRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

AddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] + b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_add_round_ss'. Requires AVX512F.
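
A minimal sketch of the lane layout shared by AddRoundSd and AddRoundSs, with [2]float64 standing in for x86.M128d; the 'rounding' argument is not modelled, since ordinary Go arithmetic always rounds to nearest:

    package main

    import "fmt"

    // addRoundSdRef mirrors the AddRoundSd pseudocode minus rounding
    // control: lane 0 is a[0]+b[0], lane 1 is copied from 'a'.
    func addRoundSdRef(a, b [2]float64) (dst [2]float64) {
    	dst[0] = a[0] + b[0]
    	dst[1] = a[1]
    	return dst
    }

    func main() {
    	fmt.Println(addRoundSdRef([2]float64{1.5, 9}, [2]float64{2.25, -1})) // [3.75 9]
    }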

func CmpEpi32Mask

func CmpEpi32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
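
The CASE table above selects one of eight integer predicates. A plain-Go sketch of that dispatch, with [4]int32 standing in for x86.M128i and the mask returned as a uint8 (bit j = lane j); the constant names are local stand-ins for the _MM_CMPINT_* values:

    package main

    import "fmt"

    // Stand-ins for the _MM_CMPINT_* encodings in the table above.
    const (
    	cmpintEQ    = 0
    	cmpintLT    = 1
    	cmpintLE    = 2
    	cmpintFALSE = 3
    	cmpintNEQ   = 4
    	cmpintNLT   = 5
    	cmpintNLE   = 6
    	cmpintTRUE  = 7
    )

    // cmpEpi32MaskRef mirrors the CmpEpi32Mask pseudocode: apply the
    // predicate selected by imm8 to each 32-bit lane and set one mask
    // bit per lane. Encodings above 7 leave the bit clear here.
    func cmpEpi32MaskRef(a, b [4]int32, imm8 byte) (k uint8) {
    	for j := 0; j <= 3; j++ {
    		var hit bool
    		switch imm8 {
    		case cmpintEQ:
    			hit = a[j] == b[j]
    		case cmpintLT:
    			hit = a[j] < b[j]
    		case cmpintLE:
    			hit = a[j] <= b[j]
    		case cmpintFALSE:
    			hit = false
    		case cmpintNEQ:
    			hit = a[j] != b[j]
    		case cmpintNLT:
    			hit = a[j] >= b[j]
    		case cmpintNLE:
    			hit = a[j] > b[j]
    		case cmpintTRUE:
    			hit = true
    		}
    		if hit {
    			k |= 1 << uint(j)
    		}
    	}
    	return k
    }

    func main() {
    	a := [4]int32{1, 2, 3, 4}
    	b := [4]int32{1, 9, 3, 0}
    	fmt.Printf("%04b\n", cmpEpi32MaskRef(a, b, cmpintEQ)) // 0101: lanes 0 and 2 are equal
    }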

func CmpEpi64Mask

func CmpEpi64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpEpu32Mask

func CmpEpu32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpEpu64Mask

func CmpEpu64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpPdMask

func CmpPdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
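
The O/U suffixes in the table distinguish ordered from unordered predicates, which differ only when an input is NaN. A scalar float64 sketch of two entries; both function names are local to this sketch:

    package main

    import (
    	"fmt"
    	"math"
    )

    // cmpEqOQ models _CMP_EQ_OQ: equal, ordered; false if either input is NaN.
    func cmpEqOQ(x, y float64) bool {
    	return !math.IsNaN(x) && !math.IsNaN(y) && x == y
    }

    // cmpNeqUQ models _CMP_NEQ_UQ: not-equal, unordered; true if either input is NaN.
    func cmpNeqUQ(x, y float64) bool {
    	return math.IsNaN(x) || math.IsNaN(y) || x != y
    }

    func main() {
    	nan := math.NaN()
    	fmt.Println(cmpEqOQ(nan, nan))  // false: NaN is unordered
    	fmt.Println(cmpNeqUQ(nan, nan)) // true
    	fmt.Println(cmpEqOQ(2, 2))      // true
    }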

func CmpPsMask

func CmpPsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpRoundSdMask

func CmpRoundSdMask(a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)

CmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0

	k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_round_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpRoundSsMask

func CmpRoundSsMask(a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)

CmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0

	k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_round_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpSdMask

func CmpSdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

CmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0

k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpSsMask

func CmpSsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

CmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0

k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpeqEpi32Mask

func CmpeqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpeq_epi32_mask'. Requires AVX512F.

func CmpeqEpi64Mask

func CmpeqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpeq_epi64_mask'. Requires AVX512F.

func CmpeqEpu32Mask

func CmpeqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpeq_epu32_mask'. Requires AVX512F.

func CmpeqEpu64Mask

func CmpeqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpeq_epu64_mask'. Requires AVX512F.

func CmpgeEpi32Mask

func CmpgeEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpge_epi32_mask'. Requires AVX512F.

func CmpgeEpi64Mask

func CmpgeEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpge_epi64_mask'. Requires AVX512F.

func CmpgeEpu32Mask

func CmpgeEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpge_epu32_mask'. Requires AVX512F.

func CmpgeEpu64Mask

func CmpgeEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpge_epu64_mask'. Requires AVX512F.

func CmpgtEpi32Mask

func CmpgtEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpgt_epi32_mask'. Requires AVX512F.

func CmpgtEpi64Mask

func CmpgtEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpgt_epi64_mask'. Requires AVX512F.

func CmpgtEpu32Mask

func CmpgtEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpgt_epu32_mask'. Requires AVX512F.
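
The epu variants differ from their epi counterparts only in reading each lane as unsigned. A one-lane sketch of how the same bit pattern compares under CmpgtEpi32Mask (signed) versus CmpgtEpu32Mask (unsigned):

    package main

    import "fmt"

    func main() {
    	// The same lane bits, read two ways.
    	var bits uint32 = 0xFFFFFFFF // as int32: -1; as uint32: 4294967295

    	signedGT := int32(bits) > 1 // VPCMPD view:  -1 > 1  → false
    	unsignedGT := bits > 1      // VPCMPUD view: max > 1 → true

    	fmt.Println(signedGT, unsignedGT) // false true
    }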

func CmpgtEpu64Mask

func CmpgtEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpgt_epu64_mask'. Requires AVX512F.

func CmpleEpi32Mask

func CmpleEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmple_epi32_mask'. Requires AVX512F.

func CmpleEpi64Mask

func CmpleEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmple_epi64_mask'. Requires AVX512F.

func CmpleEpu32Mask

func CmpleEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmple_epu32_mask'. Requires AVX512F.

func CmpleEpu64Mask

func CmpleEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmple_epu64_mask'. Requires AVX512F.

func CmpltEpi32Mask

func CmpltEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmplt_epi32_mask'. Requires AVX512F.

func CmpltEpi64Mask

func CmpltEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmplt_epi64_mask'. Requires AVX512F.

func CmpltEpu32Mask

func CmpltEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmplt_epu32_mask'. Requires AVX512F.

func CmpltEpu64Mask

func CmpltEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmplt_epu64_mask'. Requires AVX512F.

func CmpneqEpi32Mask

func CmpneqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpneq_epi32_mask'. Requires AVX512F.

func CmpneqEpi64Mask

func CmpneqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpneq_epi64_mask'. Requires AVX512F.

func CmpneqEpu32Mask

func CmpneqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpneq_epu32_mask'. Requires AVX512F.

func CmpneqEpu64Mask

func CmpneqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpneq_epu64_mask'. Requires AVX512F.

func ComiRoundSd

func ComiRoundSd(a x86.M128d, b x86.M128d, imm8 byte, sae int) int

ComiRoundSd: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0

Instruction: 'VCOMISD'. Intrinsic: '_mm_comi_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
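
Unlike the mask variants, ComiRoundSd returns the outcome as an int. A scalar sketch with imm8 fixed to the ordered less-than predicate (_CMP_LT_OS); comiLtRef is a name local to this sketch:

    package main

    import (
    	"fmt"
    	"math"
    )

    // comiLtRef models ComiRoundSd with imm8 = _CMP_LT_OS on the lower
    // lane: the boolean result as 0 or 1. NaN inputs are unordered, so
    // an ordered predicate yields 0.
    func comiLtRef(a, b float64) int {
    	if !math.IsNaN(a) && !math.IsNaN(b) && a < b {
    		return 1
    	}
    	return 0
    }

    func main() {
    	fmt.Println(comiLtRef(1, 2))          // 1
    	fmt.Println(comiLtRef(math.NaN(), 2)) // 0
    }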

func ComiRoundSs

func ComiRoundSs(a x86.M128, b x86.M128, imm8 byte, sae int) int

ComiRoundSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0

Instruction: 'VCOMISS'. Intrinsic: '_mm_comi_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CvtRoundi32Ss

func CvtRoundi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)

CvtRoundi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi32_ss'. Requires AVX512F.

func CvtRoundi64Sd

func CvtRoundi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)

CvtRoundi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_Int64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundi64_sd'. Requires AVX512F.

func CvtRoundi64Ss

func CvtRoundi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)

CvtRoundi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi64_ss'. Requires AVX512F.

func CvtRoundsdI32

func CvtRoundsdI32(a x86.M128d, rounding int) int

CvtRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i32'. Requires AVX512F.
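
Go has no per-operation rounding control, so the explicit _MM_FROUND_TO_* modes can only be approximated with math functions. A sketch for the conversion step under that assumption; the mode constants are local stand-ins and out-of-range inputs are not handled:

    package main

    import (
    	"fmt"
    	"math"
    )

    // Local stand-ins for the _MM_FROUND_TO_* constants listed above.
    const (
    	toNearest = iota
    	toNegInf
    	toPosInf
    	toZero
    )

    // cvtRoundsdI32Ref models CvtRoundsdI32 on a scalar float64:
    // convert to int32 under an explicit rounding mode.
    func cvtRoundsdI32Ref(a float64, rounding int) int32 {
    	var r float64
    	switch rounding {
    	case toNearest:
    		r = math.RoundToEven(a) // nearest, ties to even: the hardware default
    	case toNegInf:
    		r = math.Floor(a)
    	case toPosInf:
    		r = math.Ceil(a)
    	case toZero:
    		r = math.Trunc(a)
    	}
    	return int32(r)
    }

    func main() {
    	fmt.Println(cvtRoundsdI32Ref(2.5, toNearest)) // 2 (ties to even)
    	fmt.Println(cvtRoundsdI32Ref(2.5, toNegInf))  // 2
    	fmt.Println(cvtRoundsdI32Ref(2.5, toPosInf))  // 3
    	fmt.Println(cvtRoundsdI32Ref(-2.5, toZero))   // -2
    }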

func CvtRoundsdI64

func CvtRoundsdI64(a x86.M128d, rounding int) int64

CvtRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i64'. Requires AVX512F.

func CvtRoundsdSi32

func CvtRoundsdSi32(a x86.M128d, rounding int) int

CvtRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si32'. Requires AVX512F.

func CvtRoundsdSi64

func CvtRoundsdSi64(a x86.M128d, rounding int) int64

CvtRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si64'. Requires AVX512F.

func CvtRoundsdSs

func CvtRoundsdSs(a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

CvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_cvt_roundsd_ss'. Requires AVX512F.
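
A sketch of the lane layout, with [4]float32 standing in for x86.M128 and [2]float64 for x86.M128d; the 'rounding' argument is not modelled, since Go narrows float64 to float32 with round-to-nearest only:

    package main

    import "fmt"

    // cvtRoundsdSsRef mirrors the CvtRoundsdSs lane layout: lane 0 is
    // the double narrowed to float32, lanes 1-3 come from 'a'.
    func cvtRoundsdSsRef(a [4]float32, b [2]float64) (dst [4]float32) {
    	dst[0] = float32(b[0])
    	dst[1], dst[2], dst[3] = a[1], a[2], a[3]
    	return dst
    }

    func main() {
    	fmt.Println(cvtRoundsdSsRef([4]float32{0, 1, 2, 3}, [2]float64{0.1, 9}))
    	// [0.1 1 2 3]: lane 0 is the nearest float32 to 0.1
    }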

func CvtRoundsdU32

func CvtRoundsdU32(a x86.M128d, rounding int) uint32

CvtRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u32'. Requires AVX512F.

func CvtRoundsdU64

func CvtRoundsdU64(a x86.M128d, rounding int) uint64

CvtRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u64'. Requires AVX512F.

func CvtRoundsi32Ss

func CvtRoundsi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)

CvtRoundsi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi32_ss'. Requires AVX512F.

func CvtRoundsi64Sd

func CvtRoundsi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)

CvtRoundsi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_Int64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundsi64_sd'. Requires AVX512F.

func CvtRoundsi64Ss

func CvtRoundsi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)

CvtRoundsi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi64_ss'. Requires AVX512F.

func CvtRoundssI32

func CvtRoundssI32(a x86.M128, rounding int) int

CvtRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i32'. Requires AVX512F.

func CvtRoundssI64

func CvtRoundssI64(a x86.M128, rounding int) int64

CvtRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i64'. Requires AVX512F.

func CvtRoundssSd

func CvtRoundssSd(a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

CvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_cvt_roundss_sd'. Requires AVX512F.

func CvtRoundssSi32

func CvtRoundssSi32(a x86.M128, rounding int) int

CvtRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si32'. Requires AVX512F.

func CvtRoundssSi64

func CvtRoundssSi64(a x86.M128, rounding int) int64

CvtRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si64'. Requires AVX512F.

func CvtRoundssU32

func CvtRoundssU32(a x86.M128, rounding int) uint32

CvtRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u32'. Requires AVX512F.

func CvtRoundssU64

func CvtRoundssU64(a x86.M128, rounding int) uint64

CvtRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u64'. Requires AVX512F.

func CvtRoundu32Ss

func CvtRoundu32Ss(a x86.M128, b uint32, rounding int) (dst x86.M128)

CvtRoundu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu32_ss'. Requires AVX512F.

func CvtRoundu64Sd

func CvtRoundu64Sd(a x86.M128d, b uint64, rounding int) (dst x86.M128d)

CvtRoundu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvt_roundu64_sd'. Requires AVX512F.

func CvtRoundu64Ss

func CvtRoundu64Ss(a x86.M128, b uint64, rounding int) (dst x86.M128)

CvtRoundu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu64_ss'. Requires AVX512F.

func Cvtepi32Epi16

func Cvtepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_cvtepi32_epi16'. Requires AVX512F.
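
A plain-Go model of the truncating narrow: each lane keeps only its low 16 bits, which is exactly what Go's int16 conversion does. Compare the saturating Cvtsepi32Epi16 further below:

    package main

    import "fmt"

    // cvtepi32Epi16Ref mirrors Truncate_Int32_To_Int16: narrow each
    // 32-bit lane by discarding the high 16 bits.
    func cvtepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    	for j := 0; j <= 3; j++ {
    		dst[j] = int16(a[j]) // keeps only the low 16 bits
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtepi32Epi16Ref([4]int32{1, -1, 70000, -70000}))
    	// [1 -1 4464 -4464]: 70000 does not fit in int16 and wraps
    }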

func Cvtepi32Epi8

func Cvtepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_cvtepi32_epi8'. Requires AVX512F.

func Cvtepi64Epi16

func Cvtepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_cvtepi64_epi16'. Requires AVX512F.

func Cvtepi64Epi32

func Cvtepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_cvtepi64_epi32'. Requires AVX512F.

func Cvtepi64Epi8

func Cvtepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_cvtepi64_epi8'. Requires AVX512F.

func Cvtepu32Pd

func Cvtepu32Pd(a x86.M128i) (dst x86.M128d)

Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_cvtepu32_pd'. Requires AVX512F.
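
A sketch of the unsigned widen, with [4]uint32 standing in for x86.M128i (only the two low lanes are read) and [2]float64 for x86.M128d:

    package main

    import "fmt"

    // cvtepu32PdRef mirrors Cvtepu32Pd: the two low 32-bit lanes, read
    // as unsigned integers, are widened to float64. Every uint32 value
    // is exactly representable in float64.
    func cvtepu32PdRef(a [4]uint32) (dst [2]float64) {
    	for j := 0; j <= 1; j++ {
    		dst[j] = float64(a[j])
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtepu32PdRef([4]uint32{0xFFFFFFFF, 1, 0, 0}))
    	// [4.294967295e+09 1]: 0xFFFFFFFF stays 4294967295, not -1
    }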

func Cvti32Sd

func Cvti32Sd(a x86.M128d, b int) (dst x86.M128d)

Cvti32Sd: Convert the 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti32_sd'. Requires AVX512F.

func Cvti32Ss

func Cvti32Ss(a x86.M128, b int) (dst x86.M128)

Cvti32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti32_ss'. Requires AVX512F.

func Cvti64Sd

func Cvti64Sd(a x86.M128d, b int64) (dst x86.M128d)

Cvti64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti64_sd'. Requires AVX512F.

func Cvti64Ss

func Cvti64Ss(a x86.M128, b int64) (dst x86.M128)

Cvti64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti64_ss'. Requires AVX512F.

func CvtpdEpu32

func CvtpdEpu32(a x86.M128d) (dst x86.M128i)

CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_cvtpd_epu32'. Requires AVX512F.

func CvtpsEpu32

func CvtpsEpu32(a x86.M128) (dst x86.M128i)

CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_cvtps_epu32'. Requires AVX512F.

func CvtsdI32

func CvtsdI32(a x86.M128d) int

CvtsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i32'. Requires AVX512F.

func CvtsdI64

func CvtsdI64(a x86.M128d) int64

CvtsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i64'. Requires AVX512F.

func CvtsdU32

func CvtsdU32(a x86.M128d) uint32

CvtsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u32'. Requires AVX512F.

func CvtsdU64

func CvtsdU64(a x86.M128d) uint64

CvtsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u64'. Requires AVX512F.

func Cvtsepi32Epi16

func Cvtsepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_cvtsepi32_epi16'. Requires AVX512F.
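
In contrast to the truncating Cvtepi32Epi16 above, the saturating form clamps out-of-range lanes to the int16 range. A plain-Go model:

    package main

    import (
    	"fmt"
    	"math"
    )

    // cvtsepi32Epi16Ref mirrors Saturate_Int32_To_Int16: clamp each
    // 32-bit lane to [math.MinInt16, math.MaxInt16] before narrowing.
    func cvtsepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    	for j := 0; j <= 3; j++ {
    		v := a[j]
    		switch {
    		case v > math.MaxInt16:
    			v = math.MaxInt16
    		case v < math.MinInt16:
    			v = math.MinInt16
    		}
    		dst[j] = int16(v)
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtsepi32Epi16Ref([4]int32{1, -1, 70000, -70000}))
    	// [1 -1 32767 -32768]: out-of-range lanes clamp instead of wrapping
    }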

func Cvtsepi32Epi8

func Cvtsepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_cvtsepi32_epi8'. Requires AVX512F.

func Cvtsepi64Epi16

func Cvtsepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_cvtsepi64_epi16'. Requires AVX512F.

func Cvtsepi64Epi32

func Cvtsepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_cvtsepi64_epi32'. Requires AVX512F.

func Cvtsepi64Epi8

func Cvtsepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_cvtsepi64_epi8'. Requires AVX512F.

func CvtssI32

func CvtssI32(a x86.M128) int

CvtssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i32'. Requires AVX512F.

func CvtssI64

func CvtssI64(a x86.M128) int64

CvtssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i64'. Requires AVX512F.

func CvtssU32

func CvtssU32(a x86.M128) uint32

CvtssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u32'. Requires AVX512F.

func CvtssU64

func CvtssU64(a x86.M128) uint64

CvtssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u64'. Requires AVX512F.

func CvttRoundsdI32

func CvttRoundsdI32(a x86.M128d, rounding int) int

CvttRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i32'. Requires AVX512F.
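
Go's float-to-int conversion already truncates toward zero, matching Convert_FP64_To_Int32_Truncate; VCVTTSD2SI truncates regardless of the rounding argument, so this sketch models only the conversion itself:

    package main

    import "fmt"

    // cvttRoundsdI32Ref mirrors the CvttRoundsdI32 conversion step:
    // Go's int32(float64) conversion truncates toward zero.
    func cvttRoundsdI32Ref(a float64) int32 {
    	return int32(a)
    }

    func main() {
    	fmt.Println(cvttRoundsdI32Ref(2.9))  // 2
    	fmt.Println(cvttRoundsdI32Ref(-2.9)) // -2
    }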

func CvttRoundsdI64

func CvttRoundsdI64(a x86.M128d, rounding int) int64

CvttRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i64'. Requires AVX512F.

func CvttRoundsdSi32

func CvttRoundsdSi32(a x86.M128d, rounding int) int

CvttRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si32'. Requires AVX512F.

func CvttRoundsdSi64

func CvttRoundsdSi64(a x86.M128d, rounding int) int64

CvttRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si64'. Requires AVX512F.

func CvttRoundsdU32

func CvttRoundsdU32(a x86.M128d, rounding int) uint32

CvttRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u32'. Requires AVX512F.

func CvttRoundsdU64

func CvttRoundsdU64(a x86.M128d, rounding int) uint64

CvttRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u64'. Requires AVX512F.

func CvttRoundssI32

func CvttRoundssI32(a x86.M128, rounding int) int

CvttRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i32'. Requires AVX512F.

func CvttRoundssI64

func CvttRoundssI64(a x86.M128, rounding int) int64

CvttRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i64'. Requires AVX512F.

func CvttRoundssSi32

func CvttRoundssSi32(a x86.M128, rounding int) int

CvttRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si32'. Requires AVX512F.

func CvttRoundssSi64

func CvttRoundssSi64(a x86.M128, rounding int) int64

CvttRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si64'. Requires AVX512F.

func CvttRoundssU32

func CvttRoundssU32(a x86.M128, rounding int) uint32

CvttRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u32'. Requires AVX512F.

func CvttRoundssU64

func CvttRoundssU64(a x86.M128, rounding int) uint64

CvttRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u64'. Requires AVX512F.

func CvttpdEpu32

func CvttpdEpu32(a x86.M128d) (dst x86.M128i)

CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_cvttpd_epu32'. Requires AVX512F.

func CvttpsEpu32

func CvttpsEpu32(a x86.M128) (dst x86.M128i)

CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_cvttps_epu32'. Requires AVX512F.

func CvttsdI32

func CvttsdI32(a x86.M128d) int

CvttsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i32'. Requires AVX512F.
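
Go's numeric conversion already truncates toward zero, so a scalar model is direct (hedged: for out-of-range inputs the instruction returns the integer indefinite value, while Go's result is implementation-defined):

	// cvttsdI32 models VCVTTSD2SI for in-range inputs: Go's
	// float-to-int conversion discards the fractional part.
	func cvttsdI32(a float64) int32 {
		return int32(a)
	}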

func CvttsdI64

func CvttsdI64(a x86.M128d) int64

CvttsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i64'. Requires AVX512F.

func CvttsdU32

func CvttsdU32(a x86.M128d) uint32

CvttsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u32'. Requires AVX512F.

func CvttsdU64

func CvttsdU64(a x86.M128d) uint64

CvttsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u64'. Requires AVX512F.

func CvttssI32

func CvttssI32(a x86.M128) int

CvttssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i32'. Requires AVX512F.

func CvttssI64

func CvttssI64(a x86.M128) int64

CvttssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i64'. Requires AVX512F.

func CvttssU32

func CvttssU32(a x86.M128) uint32

CvttssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u32'. Requires AVX512F.

func CvttssU64

func CvttssU64(a x86.M128) uint64

CvttssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u64'. Requires AVX512F.

func Cvtu32Sd

func Cvtu32Sd(a x86.M128d, b uint32) (dst x86.M128d)

Cvtu32Sd: Convert the unsigned 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu32_sd'. Requires AVX512F.

func Cvtu32Ss

func Cvtu32Ss(a x86.M128, b uint32) (dst x86.M128)

Cvtu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu32_ss'. Requires AVX512F.

func Cvtu64Sd

func Cvtu64Sd(a x86.M128d, b uint64) (dst x86.M128d)

Cvtu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu64_sd'. Requires AVX512F.

func Cvtu64Ss

func Cvtu64Ss(a x86.M128, b uint64) (dst x86.M128)

Cvtu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu64_ss'. Requires AVX512F.
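
The low-lane behavior matches Go's own unsigned-to-float conversion, assuming the default round-to-nearest-even mode; a sketch with hypothetical array types standing in for the vectors:

	// cvtu64Ss models VCVTUSI2SS as a 4-lane operation: the low lane is
	// the rounded conversion of b, the upper three lanes come from a.
	func cvtu64Ss(a [4]float32, b uint64) [4]float32 {
		dst := a
		dst[0] = float32(b) // large values round to the nearest float32
		return dst
	}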

func Cvtusepi32Epi16

func Cvtusepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_cvtusepi32_epi16'. Requires AVX512F.
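
The unsigned-saturation step in scalar Go (a hypothetical helper, not part of this package):

	// saturateUint32ToUint16 clamps v to the uint16 range, mirroring
	// the Saturate_UnsignedInt32_To_Int16 pseudocode above.
	func saturateUint32ToUint16(v uint32) uint16 {
		if v > 0xFFFF {
			return 0xFFFF
		}
		return uint16(v)
	}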

func Cvtusepi32Epi8

func Cvtusepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_cvtusepi32_epi8'. Requires AVX512F.

func Cvtusepi64Epi16

func Cvtusepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_cvtusepi64_epi16'. Requires AVX512F.

func Cvtusepi64Epi32

func Cvtusepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_cvtusepi64_epi32'. Requires AVX512F.

func Cvtusepi64Epi8

func Cvtusepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_cvtusepi64_epi8'. Requires AVX512F.

func DivRoundSd

func DivRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

DivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] / b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_div_round_sd'. Requires AVX512F.

func DivRoundSs

func DivRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

DivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] / b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_div_round_ss'. Requires AVX512F.

func FixupimmPd

func FixupimmPd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
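
The pseudocode is dense, so here is a minimal scalar Go sketch of its two lookup steps: classifying the source into a token, then extracting that token's 4-bit response from the table word. It is an illustration only (hypothetical helpers, assuming import "math"); SNaN detection, DAZ handling and the exception flags are omitted.

	// classifyToken maps x to the TOKEN_TYPE index above. All NaNs are
	// treated as QNAN_TOKEN here; separating SNaN requires inspecting
	// the mantissa MSB, which this sketch skips.
	func classifyToken(x float64) uint {
		switch {
		case math.IsNaN(x):
			return 0 // QNAN_TOKEN
		case x == 0:
			return 2 // ZERO_VALUE_TOKEN
		case x == 1:
			return 3 // ONE_VALUE_TOKEN
		case math.IsInf(x, -1):
			return 4 // NEG_INF_TOKEN
		case math.IsInf(x, 1):
			return 5 // POS_INF_TOKEN
		case x < 0:
			return 6 // NEG_VALUE_TOKEN
		default:
			return 7 // POS_VALUE_TOKEN
		}
	}

	// tokenResponse extracts src3[3+4*j:4*j], the 4-bit action code
	// selected by token j.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}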

func FixupimmPs

func FixupimmPs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmRoundSd

func FixupimmRoundSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

FixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign ? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmRoundSs

func FixupimmRoundSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

FixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign ? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmSd

func FixupimmSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

FixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmSs

func FixupimmSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

FixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func GetexpPd

func GetexpPd(a x86.M128d) (dst x86.M128d)

GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_getexp_pd'. Requires AVX512F.
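
In Go the same per-element value is available as math.Logb, which also agrees on the special cases (Logb(±0) = -Inf, Logb(±Inf) = +Inf, Logb(NaN) = NaN); a one-line sketch, assuming import "math":

	// getexpPd models VGETEXPPD on a single element: the unbiased
	// binary exponent of a, i.e. floor(log2(|a|)), as a float64.
	func getexpPd(a float64) float64 {
		return math.Logb(a)
	}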

func GetexpPs

func GetexpPs(a x86.M128) (dst x86.M128)

GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_getexp_ps'. Requires AVX512F.

func GetexpRoundSd

func GetexpRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

GetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := ConvertExpFP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_round_sd'. Requires AVX512F.

func GetexpRoundSs

func GetexpRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

GetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := ConvertExpFP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_round_ss'. Requires AVX512F.

func GetexpSd

func GetexpSd(a x86.M128d, b x86.M128d) (dst x86.M128d)

GetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_sd'. Requires AVX512F.

func GetexpSs

func GetexpSs(a x86.M128, b x86.M128) (dst x86.M128)

GetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_ss'. Requires AVX512F.

func GetmantPd

func GetmantPd(a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_getmant_pd'. Requires AVX512F.
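
For the common _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src combination, math.Frexp yields the same normalization. A sketch under those assumptions (other interv/sc values and the special inputs 0, Inf and NaN are not modeled; assumes import "math"):

	// getmant12 scales a's significand into [1, 2), keeping the source
	// sign: Frexp returns |frac| in [0.5, 1), so doubling lands in [1, 2).
	func getmant12(a float64) float64 {
		frac, _ := math.Frexp(a)
		return frac * 2
	}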

func GetmantPs

func GetmantPs(a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_getmant_ps'. Requires AVX512F.

func GetmantRoundSd

func GetmantRoundSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

GetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_round_sd'. Requires AVX512F.

func GetmantRoundSs

func GetmantRoundSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

GetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_round_ss'. Requires AVX512F.

func GetmantSd

func GetmantSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

GetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_sd'. Requires AVX512F.

func GetmantSs

func GetmantSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

GetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_ss'. Requires AVX512F.

func M256AbsEpi64

func M256AbsEpi64(a x86.M256i) (dst x86.M256i)

M256AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_abs_epi64'. Requires AVX512F.
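
A scalar Go model of the loop, including the two's-complement edge case the instruction shares:

	// absEpi64 models VPABSQ over four 64-bit lanes. As with the
	// instruction, the absolute value of math.MinInt64 wraps to itself.
	func absEpi64(a [4]int64) (dst [4]int64) {
		for j, v := range a {
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
		return
	}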

func M256BroadcastF32x4

func M256BroadcastF32x4(a x86.M128) (dst x86.M256)

M256BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_broadcast_f32x4'. Requires AVX512F.
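
Equivalently in scalar Go (a sketch; lane j of the result copies lane j mod 4 of the source):

	// broadcastF32x4 models VBROADCASTF32X4: the 128-bit source is
	// repeated across both 128-bit halves of the 256-bit destination.
	func broadcastF32x4(a [4]float32) (dst [8]float32) {
		for j := range dst {
			dst[j] = a[j%4]
		}
		return
	}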

func M256BroadcastI32x4

func M256BroadcastI32x4(a x86.M128i) (dst x86.M256i)

M256BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_broadcast_i32x4'. Requires AVX512F.

func M256CmpEpi32Mask

func M256CmpEpi32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
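
The mask layout is one result bit per lane. A Go sketch for a single fixed predicate (imm8 = _MM_CMPINT_LT; the other seven predicates only swap the comparison):

	// cmpltEpi32Mask models VPCMPD with the _MM_CMPINT_LT predicate:
	// bit j of the returned mask is set when a[j] < b[j].
	func cmpltEpi32Mask(a, b [8]int32) (k uint8) {
		for j := range a {
			if a[j] < b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}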

func M256CmpEpi64Mask

func M256CmpEpi64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu32Mask

func M256CmpEpu32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu64Mask

func M256CmpEpu64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpPdMask

func M256CmpPdMask(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)

M256CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm256_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
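
For the floating-point predicates the ordered/unordered distinction matters once a NaN appears. Go's comparison operators are ordered (false on NaN), so the _CMP_LT_OS case is direct; a sketch:

	// cmpltPdMask models VCMPPD with the _CMP_LT_OS predicate: bit j is
	// set when a[j] < b[j]; a lane containing NaN compares false.
	func cmpltPdMask(a, b [4]float64) (k uint8) {
		for j := range a {
			if a[j] < b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}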

func M256CmpPsMask

func M256CmpPsMask(a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)

M256CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm256_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpeqEpi32Mask

func M256CmpeqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpeq_epi32_mask'. Requires AVX512F.

func M256CmpeqEpi64Mask

func M256CmpeqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpeq_epi64_mask'. Requires AVX512F.

func M256CmpeqEpu32Mask

func M256CmpeqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpeq_epu32_mask'. Requires AVX512F.

func M256CmpeqEpu64Mask

func M256CmpeqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpeq_epu64_mask'. Requires AVX512F.

func M256CmpgeEpi32Mask

func M256CmpgeEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpge_epi32_mask'. Requires AVX512F.

func M256CmpgeEpi64Mask

func M256CmpgeEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpge_epi64_mask'. Requires AVX512F.

func M256CmpgeEpu32Mask

func M256CmpgeEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpge_epu32_mask'. Requires AVX512F.

func M256CmpgeEpu64Mask

func M256CmpgeEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpge_epu64_mask'. Requires AVX512F.

func M256CmpgtEpi32Mask

func M256CmpgtEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpgt_epi32_mask'. Requires AVX512F.

func M256CmpgtEpi64Mask

func M256CmpgtEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpgt_epi64_mask'. Requires AVX512F.

func M256CmpgtEpu32Mask

func M256CmpgtEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpgt_epu32_mask'. Requires AVX512F.

func M256CmpgtEpu64Mask

func M256CmpgtEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpgt_epu64_mask'. Requires AVX512F.

func M256CmpleEpi32Mask

func M256CmpleEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmple_epi32_mask'. Requires AVX512F.

func M256CmpleEpi64Mask

func M256CmpleEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmple_epi64_mask'. Requires AVX512F.

func M256CmpleEpu32Mask

func M256CmpleEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmple_epu32_mask'. Requires AVX512F.

func M256CmpleEpu64Mask

func M256CmpleEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmple_epu64_mask'. Requires AVX512F.

func M256CmpltEpi32Mask

func M256CmpltEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmplt_epi32_mask'. Requires AVX512F.

func M256CmpltEpi64Mask

func M256CmpltEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmplt_epi64_mask'. Requires AVX512F.

func M256CmpltEpu32Mask

func M256CmpltEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmplt_epu32_mask'. Requires AVX512F.

func M256CmpltEpu64Mask

func M256CmpltEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmplt_epu64_mask'. Requires AVX512F.

func M256CmpneqEpi32Mask

func M256CmpneqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpneq_epi32_mask'. Requires AVX512F.

func M256CmpneqEpi64Mask

func M256CmpneqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpneq_epi64_mask'. Requires AVX512F.

func M256CmpneqEpu32Mask

func M256CmpneqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpneq_epu32_mask'. Requires AVX512F.

func M256CmpneqEpu64Mask

func M256CmpneqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpneq_epu64_mask'. Requires AVX512F.

func M256Cvtepi32Epi16

func M256Cvtepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_cvtepi32_epi16'. Requires AVX512F.
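
Truncation drops the upper bits of each lane, exactly like a narrowing integer conversion in Go; out-of-range values wrap rather than clamp. A hypothetical scalar model:

    // cvtEpi32Epi16 models VPMOVDW: each 32-bit lane keeps only its
    // low 16 bits (Truncate_Int32_To_Int16), so 0x12345678 becomes 0x5678.
    func cvtEpi32Epi16(a [8]int32) (dst [8]int16) {
    	for j, v := range a {
    		dst[j] = int16(v)
    	}
    	return
    }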

func M256Cvtepi32Epi8

func M256Cvtepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_cvtepi32_epi8'. Requires AVX512F.

func M256Cvtepi64Epi16

func M256Cvtepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_cvtepi64_epi16'. Requires AVX512F.

func M256Cvtepi64Epi32

func M256Cvtepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_cvtepi64_epi32'. Requires AVX512F.

func M256Cvtepi64Epi8

func M256Cvtepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_cvtepi64_epi8'. Requires AVX512F.

func M256Cvtepu32Pd

func M256Cvtepu32Pd(a x86.M128i) (dst x86.M256d)

M256Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_cvtepu32_pd'. Requires AVX512F.

func M256CvtpdEpu32

func M256CvtpdEpu32(a x86.M256d) (dst x86.M128i)

M256CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_cvtpd_epu32'. Requires AVX512F.

func M256CvtpsEpu32

func M256CvtpsEpu32(a x86.M256) (dst x86.M256i)

M256CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_cvtps_epu32'. Requires AVX512F.

func M256Cvtsepi32Epi16

func M256Cvtsepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_cvtsepi32_epi16'. Requires AVX512F.
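
In contrast to the truncating VPMOVDW above, the saturating form clamps out-of-range values to the int16 limits. A hypothetical Go model of Saturate_Int32_To_Int16:

    // saturateInt32ToInt16 clamps to [-32768, 32767] instead of wrapping.
    func saturateInt32ToInt16(v int32) int16 {
    	if v > 32767 {
    		return 32767
    	}
    	if v < -32768 {
    		return -32768
    	}
    	return int16(v)
    }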

func M256Cvtsepi32Epi8

func M256Cvtsepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_cvtsepi32_epi8'. Requires AVX512F.

func M256Cvtsepi64Epi16

func M256Cvtsepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_cvtsepi64_epi16'. Requires AVX512F.

func M256Cvtsepi64Epi32

func M256Cvtsepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_cvtsepi64_epi32'. Requires AVX512F.

func M256Cvtsepi64Epi8

func M256Cvtsepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_cvtsepi64_epi8'. Requires AVX512F.

func M256CvttpdEpu32

func M256CvttpdEpu32(a x86.M256d) (dst x86.M128i)

M256CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_cvttpd_epu32'. Requires AVX512F.

func M256CvttpsEpu32

func M256CvttpsEpu32(a x86.M256) (dst x86.M256i)

M256CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_cvttps_epu32'. Requires AVX512F.
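
The only difference from _mm256_cvtps_epu32 above is the rounding step: this form always truncates toward zero instead of using the current MXCSR rounding mode, so 2.7 becomes 2 rather than 3. A hypothetical one-lane Go sketch (the real instruction's out-of-range and NaN behavior is not modeled):

    // cvttF32ToU32 models Convert_FP32_To_UnsignedInt32_Truncate for an
    // in-range lane; Go's float-to-integer conversion truncates toward zero.
    func cvttF32ToU32(x float32) uint32 {
    	return uint32(x)
    }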

func M256Cvtusepi32Epi16

func M256Cvtusepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_cvtusepi32_epi16'. Requires AVX512F.

func M256Cvtusepi32Epi8

func M256Cvtusepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_cvtusepi32_epi8'. Requires AVX512F.

func M256Cvtusepi64Epi16

func M256Cvtusepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_cvtusepi64_epi16'. Requires AVX512F.

func M256Cvtusepi64Epi32

func M256Cvtusepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_cvtusepi64_epi32'. Requires AVX512F.

func M256Cvtusepi64Epi8

func M256Cvtusepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_cvtusepi64_epi8'. Requires AVX512F.

func M256Extractf32x4Ps

func M256Extractf32x4Ps(a x86.M256, imm8 byte) (dst x86.M128)

M256Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256Extracti32x4Epi32

func M256Extracti32x4Epi32(a x86.M256i, imm8 byte) (dst x86.M128i)

M256Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256FixupimmPd

func M256FixupimmPd(a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
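
The core of the fixup operation is a 16-entry response table packed into each element of 'c', indexed by the 3-bit token class j derived from the input. A hypothetical Go sketch of just that field extraction (token_response[3:0] := src3[3+4*j:4*j]):

    // tokenResponse pulls the 4-bit response for token class j (0-7)
    // out of one 64-bit control element of 'c'.
    func tokenResponse(src3 uint64, j uint) uint8 {
    	return uint8(src3>>(4*j)) & 0xF
    }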

func M256FixupimmPs

func M256FixupimmPs(a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256GetexpPd

func M256GetexpPd(a x86.M256d) (dst x86.M256d)

M256GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_getexp_pd'. Requires AVX512F.

func M256GetexpPs

func M256GetexpPs(a x86.M256) (dst x86.M256)

M256GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_getexp_ps'. Requires AVX512F.
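
For a finite, nonzero input the result is floor(log2(|x|)) returned as a floating-point value. A hypothetical Go model using math.Frexp (zero, infinity, and NaN are not handled here):

    import "math"

    // getExp models ConvertExpFP64 for a normal, nonzero input:
    // math.Frexp returns frac in [0.5, 1) with x = frac * 2^exp,
    // so floor(log2(|x|)) is exp - 1.
    func getExp(x float64) float64 {
    	_, exp := math.Frexp(x)
    	return float64(exp - 1)
    }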

func M256GetmantPd

func M256GetmantPd(a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_getmant_pd'. Requires AVX512F.

func M256GetmantPs

func M256GetmantPs(a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_getmant_ps'. Requires AVX512F.
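
Normalization rescales the significand into the chosen interval and discards the original exponent. A hypothetical Go model of the _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src case, valid for finite nonzero inputs:

    import "math"

    // getMant12 returns the significand scaled into [1, 2) with the
    // source sign kept, e.g. -12 = -1.5 * 2^3 yields -1.5.
    func getMant12(x float64) float64 {
    	frac, _ := math.Frexp(x) // frac in [0.5, 1), sign preserved
    	return frac * 2          // now in [1, 2)
    }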

func M256Insertf32x4

func M256Insertf32x4(a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256Inserti32x4

func M256Inserti32x4(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
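
Both insert intrinsics treat the 256-bit destination as two 128-bit lanes, with the low bit of 'imm8' selecting which lane 'b' overwrites. A hypothetical Go model on plain arrays:

    // insertI32x4 copies 'a', then overwrites one 4-element lane with 'b';
    // imm8 bit 0 selects elements 0-3 or 4-7.
    func insertI32x4(a [8]int32, b [4]int32, imm8 byte) [8]int32 {
    	dst := a
    	off := int(imm8&1) * 4
    	copy(dst[off:off+4], b[:])
    	return dst
    }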

func M256Mask2Permutex2varEpi32

func M256Mask2Permutex2varEpi32(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)

M256Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm256_mask2_permutex2var_epi32'. Requires AVX512F.

func M256Mask2Permutex2varEpi64

func M256Mask2Permutex2varEpi64(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)

M256Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm256_mask2_permutex2var_epi64'. Requires AVX512F.

func M256Mask2Permutex2varPd

func M256Mask2Permutex2varPd(a x86.M256d, idx x86.M256i, k x86.Mmask8, b x86.M256d) (dst x86.M256d)

M256Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm256_mask2_permutex2var_pd'. Requires AVX512F.

func M256Mask2Permutex2varPs

func M256Mask2Permutex2varPs(a x86.M256, idx x86.M256i, k x86.Mmask8, b x86.M256) (dst x86.M256)

M256Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm256_mask2_permutex2var_ps'. Requires AVX512F.
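
Each 32-bit index element supplies three selector bits for the element position and one more bit (idx[i+3]; idx[i+2] in the 64-bit variants) to choose between the two sources. A hypothetical Go model of the unmasked selection for one 32-bit lane:

    // permutex2varLane decodes one index: bits 2:0 pick the element,
    // bit 3 picks source 'b' over source 'a'.
    func permutex2varLane(a, b [8]int32, idx int32) int32 {
    	off := idx & 0x7
    	if idx&0x8 != 0 {
    		return b[off]
    	}
    	return a[off]
    }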

func M256Mask3FmaddPd

func M256Mask3FmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask3_fmadd_pd'. Requires AVX512F.
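
In the mask3 variants the fallback source is 'c', the same operand that supplies the addend. A hypothetical Go model using math.FMA, which, like the hardware instruction, rounds only once:

    import "math"

    // mask3FmaddPd: lanes with the mask bit set get fma(a, b, c);
    // the rest keep the corresponding lane of c.
    func mask3FmaddPd(a, b, c [4]float64, k uint8) (dst [4]float64) {
    	for j := 0; j < 4; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = math.FMA(a[j], b[j], c[j])
    		} else {
    			dst[j] = c[j]
    		}
    	}
    	return
    }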

func M256Mask3FmaddPs

func M256Mask3FmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask3_fmadd_ps'. Requires AVX512F.

func M256Mask3FmaddsubPd

func M256Mask3FmaddsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask3_fmaddsub_pd'. Requires AVX512F.

func M256Mask3FmaddsubPs

func M256Mask3FmaddsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask3_fmaddsub_ps'. Requires AVX512F.

func M256Mask3FmsubPd

func M256Mask3FmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask3_fmsub_pd'. Requires AVX512F.

func M256Mask3FmsubPs

func M256Mask3FmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask3_fmsub_ps'. Requires AVX512F.

func M256Mask3FmsubaddPd

func M256Mask3FmsubaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask3_fmsubadd_pd'. Requires AVX512F.

func M256Mask3FmsubaddPs

func M256Mask3FmsubaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask3_fmsubadd_ps'. Requires AVX512F.
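
The fmaddsub and fmsubadd pairs differ only in which lane parity adds and which subtracts. A hypothetical Go sketch of the unmasked fmaddsub pattern (fmsubadd simply swaps the two branches):

    // fmaddsubPd: even lanes compute a*b - c, odd lanes a*b + c,
    // matching the (j is even) test in the pseudocode above.
    func fmaddsubPd(a, b, c [4]float64) (dst [4]float64) {
    	for j := 0; j < 4; j++ {
    		if j%2 == 0 {
    			dst[j] = a[j]*b[j] - c[j]
    		} else {
    			dst[j] = a[j]*b[j] + c[j]
    		}
    	}
    	return
    }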

func M256Mask3FnmaddPd

func M256Mask3FnmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask3_fnmadd_pd'. Requires AVX512F.

func M256Mask3FnmaddPs

func M256Mask3FnmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask3_fnmadd_ps'. Requires AVX512F.

func M256Mask3FnmsubPd

func M256Mask3FnmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask3_fnmsub_pd'. Requires AVX512F.

func M256Mask3FnmsubPs

func M256Mask3FnmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask3_fnmsub_ps'. Requires AVX512F.

func M256MaskAbsEpi32

func M256MaskAbsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm256_mask_abs_epi32'. Requires AVX512F.

func M256MaskAbsEpi64

func M256MaskAbsEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_mask_abs_epi64'. Requires AVX512F.

func M256MaskAddEpi32

func M256MaskAddEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm256_mask_add_epi32'. Requires AVX512F.
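
All of the masked arithmetic intrinsics follow the same shape: compute the lane when its mask bit is set, otherwise copy the lane from 'src'. A hypothetical Go model of the 32-bit masked add:

    // maskAddEpi32: lanes with the mask bit set get a+b, the rest keep src.
    func maskAddEpi32(src [8]int32, k uint8, a, b [8]int32) (dst [8]int32) {
    	for j := 0; j < 8; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = a[j] + b[j]
    		} else {
    			dst[j] = src[j]
    		}
    	}
    	return
    }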

func M256MaskAddEpi64

func M256MaskAddEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm256_mask_add_epi64'. Requires AVX512F.

func M256MaskAndEpi32

func M256MaskAndEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm256_mask_and_epi32'. Requires AVX512F.

func M256MaskAndEpi64

func M256MaskAndEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm256_mask_and_epi64'. Requires AVX512F.

func M256MaskAndnotEpi32

func M256MaskAndnotEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm256_mask_andnot_epi32'. Requires AVX512F.

func M256MaskAndnotEpi64

func M256MaskAndnotEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm256_mask_andnot_epi64'. Requires AVX512F.

func M256MaskBlendEpi32

func M256MaskBlendEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMD'. Intrinsic: '_mm256_mask_blend_epi32'. Requires AVX512F.

func M256MaskBlendEpi64

func M256MaskBlendEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMQ'. Intrinsic: '_mm256_mask_blend_epi64'. Requires AVX512F.

func M256MaskBlendPd

func M256MaskBlendPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBLENDMPD'. Intrinsic: '_mm256_mask_blend_pd'. Requires AVX512F.

func M256MaskBlendPs

func M256MaskBlendPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBLENDMPS'. Intrinsic: '_mm256_mask_blend_ps'. Requires AVX512F.
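
Blend is masking with the other input as the fallback: where the masked operations above copy from 'src', blend copies from 'a'. A hypothetical Go model:

    // maskBlendPs picks b where the mask bit is set, a elsewhere.
    func maskBlendPs(k uint8, a, b [8]float32) (dst [8]float32) {
    	for j := 0; j < 8; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = b[j]
    		} else {
    			dst[j] = a[j]
    		}
    	}
    	return
    }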

func M256MaskBroadcastF32x4

func M256MaskBroadcastF32x4(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_mask_broadcast_f32x4'. Requires AVX512F.
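
The 128-bit source repeats across both halves of the destination, so lane j reads source element j mod 4 before the writemask is applied. A hypothetical Go sketch of the unmasked broadcast:

    // broadcastF32x4 repeats the 4-element source across all 8 lanes.
    func broadcastF32x4(a [4]float32) (dst [8]float32) {
    	for j := 0; j < 8; j++ {
    		dst[j] = a[j%4]
    	}
    	return
    }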

func M256MaskBroadcastI32x4

func M256MaskBroadcastI32x4(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_mask_broadcast_i32x4'. Requires AVX512F.

func M256MaskBroadcastdEpi32

func M256MaskBroadcastdEpi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_broadcastd_epi32'. Requires AVX512F.

func M256MaskBroadcastqEpi64

func M256MaskBroadcastqEpi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_broadcastq_epi64'. Requires AVX512F.

func M256MaskBroadcastsdPd

func M256MaskBroadcastsdPd(src x86.M256d, k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_mask_broadcastsd_pd'. Requires AVX512F.

func M256MaskBroadcastssPs

func M256MaskBroadcastssPs(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_mask_broadcastss_ps'. Requires AVX512F.

func M256MaskCmpEpi32Mask

func M256MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
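
The 'imm8' operand selects one of the eight _MM_CMPINT_* predicates, and the zeromask then gates each lane of the result. A hypothetical Go model of the predicate dispatch for a single signed 32-bit lane:

    // cmpInt32 applies the _MM_CMPINT_* predicate enumerated above
    // to one signed 32-bit lane.
    func cmpInt32(a, b int32, imm8 byte) bool {
    	switch imm8 & 7 {
    	case 0: // _MM_CMPINT_EQ
    		return a == b
    	case 1: // _MM_CMPINT_LT
    		return a < b
    	case 2: // _MM_CMPINT_LE
    		return a <= b
    	case 3: // _MM_CMPINT_FALSE
    		return false
    	case 4: // _MM_CMPINT_NEQ
    		return a != b
    	case 5: // _MM_CMPINT_NLT
    		return a >= b
    	case 6: // _MM_CMPINT_NLE
    		return a > b
    	default: // _MM_CMPINT_TRUE
    		return true
    	}
    }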

func M256MaskCmpEpi64Mask

func M256MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu32Mask

func M256MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu64Mask

func M256MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpPdMask

func M256MaskCmpPdMask(k1 x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)

M256MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm256_mask_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpPsMask

func M256MaskCmpPsMask(k1 x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)

M256MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm256_mask_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpeqEpi32Mask

func M256MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpeq_epi32_mask'. Requires AVX512F.

func M256MaskCmpeqEpi64Mask

func M256MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpeq_epi64_mask'. Requires AVX512F.

func M256MaskCmpeqEpu32Mask

func M256MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpeq_epu32_mask'. Requires AVX512F.

func M256MaskCmpeqEpu64Mask

func M256MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpeq_epu64_mask'. Requires AVX512F.

func M256MaskCmpgeEpi32Mask

func M256MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpge_epi32_mask'. Requires AVX512F.

func M256MaskCmpgeEpi64Mask

func M256MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpge_epi64_mask'. Requires AVX512F.

func M256MaskCmpgeEpu32Mask

func M256MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpge_epu32_mask'. Requires AVX512F.

func M256MaskCmpgeEpu64Mask

func M256MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpge_epu64_mask'. Requires AVX512F.

func M256MaskCmpgtEpi32Mask

func M256MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpgt_epi32_mask'. Requires AVX512F.

func M256MaskCmpgtEpi64Mask

func M256MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpgt_epi64_mask'. Requires AVX512F.

func M256MaskCmpgtEpu32Mask

func M256MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpgt_epu32_mask'. Requires AVX512F.

func M256MaskCmpgtEpu64Mask

func M256MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpgt_epu64_mask'. Requires AVX512F.

func M256MaskCmpleEpi32Mask

func M256MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmple_epi32_mask'. Requires AVX512F.

func M256MaskCmpleEpi64Mask

func M256MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmple_epi64_mask'. Requires AVX512F.

func M256MaskCmpleEpu32Mask

func M256MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmple_epu32_mask'. Requires AVX512F.

func M256MaskCmpleEpu64Mask

func M256MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmple_epu64_mask'. Requires AVX512F.

func M256MaskCmpltEpi32Mask

func M256MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmplt_epi32_mask'. Requires AVX512F.

func M256MaskCmpltEpi64Mask

func M256MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmplt_epi64_mask'. Requires AVX512F.

func M256MaskCmpltEpu32Mask

func M256MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmplt_epu32_mask'. Requires AVX512F.

func M256MaskCmpltEpu64Mask

func M256MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmplt_epu64_mask'. Requires AVX512F.

func M256MaskCmpneqEpi32Mask

func M256MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpneq_epi32_mask'. Requires AVX512F.

func M256MaskCmpneqEpi64Mask

func M256MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpneq_epi64_mask'. Requires AVX512F.

func M256MaskCmpneqEpu32Mask

func M256MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpneq_epu32_mask'. Requires AVX512F.

func M256MaskCmpneqEpu64Mask

func M256MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpneq_epu64_mask'. Requires AVX512F.

func M256MaskCompressEpi32

func M256MaskCompressEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_mask_compress_epi32'. Requires AVX512F.
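
The compress pattern is the trickiest of these masked operations to read from the pseudocode alone: active elements are packed contiguously toward lane 0, and only the tail keeps 'src'. Below is a minimal pure-Go model of the behaviour described above (names and the [8]int32 representation are illustrative, not the package API).

	// maskCompressEpi32 packs the k-selected lanes of a contiguously
	// from lane 0; lanes past the last packed element keep src.
	func maskCompressEpi32(src [8]int32, k uint8, a [8]int32) [8]int32 {
		dst := src // unwritten upper lanes pass through from src
		m := 0
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		return dst
	}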

func M256MaskCompressEpi64

func M256MaskCompressEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_mask_compress_epi64'. Requires AVX512F.

func M256MaskCompressPd

func M256MaskCompressPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_mask_compress_pd'. Requires AVX512F.

func M256MaskCompressPs

func M256MaskCompressPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_mask_compress_ps'. Requires AVX512F.

func M256MaskCvtRoundpsPh

func M256MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvt_roundps_ph'. Requires AVX512F.

func M256MaskCvtepi16Epi32

func M256MaskCvtepi16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_mask_cvtepi16_epi32'. Requires AVX512F.
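
In plain Go the per-element operation is just an int16-to-int32 conversion, which sign-extends. A small sketch of the merge behaviour, with illustrative names (not the package API):

	// maskCvtepi16Epi32 widens each selected 16-bit lane with sign
	// extension; inactive lanes are copied from src.
	func maskCvtepi16Epi32(src [8]int32, k uint8, a [8]int16) [8]int32 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // Go's widening conversion sign-extends
			}
		}
		return dst
	}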

func M256MaskCvtepi16Epi64

func M256MaskCvtepi16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_mask_cvtepi16_epi64'. Requires AVX512F.

func M256MaskCvtepi32Epi16

func M256MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_mask_cvtepi32_epi16'. Requires AVX512F.
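
The 'with truncation' wording maps directly onto Go's narrowing integer conversion, which keeps only the low bits. A hedged sketch (illustrative names, not the package API):

	// maskCvtepi32Epi16 keeps the low 16 bits of each selected
	// 32-bit lane; inactive lanes are copied from src.
	func maskCvtepi32Epi16(src [8]int16, k uint8, a [8]int32) [8]int16 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int16(a[j]) // narrowing conversion truncates to the low bits
			}
		}
		return dst
	}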

func M256MaskCvtepi32Epi64

func M256MaskCvtepi32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_mask_cvtepi32_epi64'. Requires AVX512F.

func M256MaskCvtepi32Epi8

func M256MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_mask_cvtepi32_epi8'. Requires AVX512F.

func M256MaskCvtepi32Pd

func M256MaskCvtepi32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_mask_cvtepi32_pd'. Requires AVX512F.

func M256MaskCvtepi32Ps

func M256MaskCvtepi32Ps(src x86.M256, k x86.Mmask8, a x86.M256i) (dst x86.M256)

M256MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_mask_cvtepi32_ps'. Requires AVX512F.

func M256MaskCvtepi64Epi16

func M256MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_mask_cvtepi64_epi16'. Requires AVX512F.

func M256MaskCvtepi64Epi32

func M256MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_mask_cvtepi64_epi32'. Requires AVX512F.

func M256MaskCvtepi64Epi8

func M256MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_mask_cvtepi64_epi8'. Requires AVX512F.

func M256MaskCvtepi8Epi32

func M256MaskCvtepi8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_mask_cvtepi8_epi32'. Requires AVX512F.

func M256MaskCvtepi8Epi64

func M256MaskCvtepi8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_mask_cvtepi8_epi64'. Requires AVX512F.

func M256MaskCvtepu16Epi32

func M256MaskCvtepu16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_mask_cvtepu16_epi32'. Requires AVX512F.

func M256MaskCvtepu16Epi64

func M256MaskCvtepu16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_mask_cvtepu16_epi64'. Requires AVX512F.

func M256MaskCvtepu32Epi64

func M256MaskCvtepu32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_mask_cvtepu32_epi64'. Requires AVX512F.

func M256MaskCvtepu32Pd

func M256MaskCvtepu32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_mask_cvtepu32_pd'. Requires AVX512F.

func M256MaskCvtepu8Epi32

func M256MaskCvtepu8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_mask_cvtepu8_epi32'. Requires AVX512F.

func M256MaskCvtepu8Epi64

func M256MaskCvtepu8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_mask_cvtepu8_epi64'. Requires AVX512F.

func M256MaskCvtpdEpi32

func M256MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_mask_cvtpd_epi32'. Requires AVX512F.

func M256MaskCvtpdEpu32

func M256MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_mask_cvtpd_epu32'. Requires AVX512F.

func M256MaskCvtpdPs

func M256MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M256d) (dst x86.M128)

M256MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_mask_cvtpd_ps'. Requires AVX512F.
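
For the active lanes this is an ordinary float64-to-float32 narrowing. Note that Go's conversion rounds to nearest-even, which matches the default MXCSR rounding mode but not the other modes. An illustrative sketch:

	// maskCvtpdPs narrows selected float64 lanes to float32 and
	// merges inactive lanes from src.
	func maskCvtpdPs(src [4]float32, k uint8, a [4]float64) [4]float32 {
		dst := src
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = float32(a[j]) // rounds to nearest-even
			}
		}
		return dst
	}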

func M256MaskCvtphPs

func M256MaskCvtphPs(src x86.M256, k x86.Mmask8, a x86.M128i) (dst x86.M256)

M256MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_mask_cvtph_ps'. Requires AVX512F.

func M256MaskCvtpsEpi32

func M256MaskCvtpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_mask_cvtps_epi32'. Requires AVX512F.

func M256MaskCvtpsEpu32

func M256MaskCvtpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_mask_cvtps_epu32'. Requires AVX512F.

func M256MaskCvtpsPh

func M256MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvtps_ph'. Requires AVX512F.

func M256MaskCvtsepi32Epi16

func M256MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_mask_cvtsepi32_epi16'. Requires AVX512F.
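
Signed saturation clamps to the destination range instead of wrapping, which is the whole difference from the plain truncating VPMOVDW above. A sketch of the per-element helper, using the standard library's math constants:

	import "math"

	// saturateInt32ToInt16 clamps x to the int16 range, mirroring
	// Saturate_Int32_To_Int16 in the pseudocode above.
	func saturateInt32ToInt16(x int32) int16 {
		if x > math.MaxInt16 {
			return math.MaxInt16
		}
		if x < math.MinInt16 {
			return math.MinInt16
		}
		return int16(x)
	}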

func M256MaskCvtsepi32Epi8

func M256MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_mask_cvtsepi32_epi8'. Requires AVX512F.

func M256MaskCvtsepi64Epi16

func M256MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_mask_cvtsepi64_epi16'. Requires AVX512F.

func M256MaskCvtsepi64Epi32

func M256MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_mask_cvtsepi64_epi32'. Requires AVX512F.

func M256MaskCvtsepi64Epi8

func M256MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_mask_cvtsepi64_epi8'. Requires AVX512F.

func M256MaskCvttpdEpi32

func M256MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_mask_cvttpd_epi32'. Requires AVX512F.

func M256MaskCvttpdEpu32

func M256MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_mask_cvttpd_epu32'. Requires AVX512F.

func M256MaskCvttpsEpi32

func M256MaskCvttpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_mask_cvttps_epi32'. Requires AVX512F.
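
Go's float-to-int conversion also truncates toward zero, so the active lanes can be modelled directly. Beware that the hardware returns the 'integer indefinite' value for out-of-range inputs, while Go leaves such conversions implementation-dependent; the sketch below ignores that case.

	// maskCvttpsEpi32 truncates selected float32 lanes toward zero;
	// out-of-range inputs are deliberately not handled here.
	func maskCvttpsEpi32(src [8]int32, k uint8, a [8]float32) [8]int32 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // float-to-int conversion truncates toward zero
			}
		}
		return dst
	}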

func M256MaskCvttpsEpu32

func M256MaskCvttpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_mask_cvttps_epu32'. Requires AVX512F.

func M256MaskCvtusepi32Epi16

func M256MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_mask_cvtusepi32_epi16'. Requires AVX512F.

func M256MaskCvtusepi32Epi8

func M256MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_mask_cvtusepi32_epi8'. Requires AVX512F.

func M256MaskCvtusepi64Epi16

func M256MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_mask_cvtusepi64_epi16'. Requires AVX512F.

func M256MaskCvtusepi64Epi32

func M256MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_mask_cvtusepi64_epi32'. Requires AVX512F.

func M256MaskCvtusepi64Epi8

func M256MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_mask_cvtusepi64_epi8'. Requires AVX512F.

func M256MaskDivPd

func M256MaskDivPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm256_mask_div_pd'. Requires AVX512F.

func M256MaskDivPs

func M256MaskDivPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm256_mask_div_ps'. Requires AVX512F.

func M256MaskExpandEpi32

func M256MaskExpandEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_mask_expand_epi32'. Requires AVX512F.

func M256MaskExpandEpi64

func M256MaskExpandEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_mask_expand_epi64'. Requires AVX512F.

func M256MaskExpandPd

func M256MaskExpandPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_mask_expand_pd'. Requires AVX512F.

func M256MaskExpandPs

func M256MaskExpandPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_mask_expand_ps'. Requires AVX512F.
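
Expand is the inverse of compress: source elements are consumed contiguously from lane 0 and scattered into the active destination lanes. A pure-Go model of the pseudocode above (illustrative names):

	// maskExpandPs distributes contiguous elements of a into the
	// k-selected lanes; inactive lanes are copied from src.
	func maskExpandPs(src [8]float32, k uint8, a [8]float32) [8]float32 {
		dst := src
		m := 0
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return dst
	}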

func M256MaskExtractf32x4Ps

func M256MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)

M256MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_mask_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskExtracti32x4Epi32

func M256MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_mask_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFixupimmPd

func M256MaskFixupimmPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 0.5
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFixupimmPs

func M256MaskFixupimmPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 0.5
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFmaddPd

func M256MaskFmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask_fmadd_pd'. Requires AVX512F.

func M256MaskFmaddPs

func M256MaskFmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask_fmadd_ps'. Requires AVX512F.

func M256MaskFmaddsubPd

func M256MaskFmaddsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask_fmaddsub_pd'. Requires AVX512F.

func M256MaskFmaddsubPs

func M256MaskFmaddsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask_fmaddsub_ps'. Requires AVX512F.
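
The even/odd alternation is the only subtlety here. Also note that a plain Go a*b+c rounds twice, whereas the fused hardware operation rounds once (the standard library's math.FMA offers a fused float64 form); the sketch below keeps the simple, non-fused form for clarity.

	// maskFmaddsubPs: even lanes compute a*b-c, odd lanes a*b+c;
	// inactive lanes keep a. Unlike the instruction, a*b+c and
	// a*b-c here each round twice.
	func maskFmaddsubPs(a [8]float32, k uint8, b, c [8]float32) [8]float32 {
		dst := a
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			if j%2 == 0 {
				dst[j] = a[j]*b[j] - c[j]
			} else {
				dst[j] = a[j]*b[j] + c[j]
			}
		}
		return dst
	}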

func M256MaskFmsubPd

func M256MaskFmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask_fmsub_pd'. Requires AVX512F.

func M256MaskFmsubPs

func M256MaskFmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask_fmsub_ps'. Requires AVX512F.

func M256MaskFmsubaddPd

func M256MaskFmsubaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask_fmsubadd_pd'. Requires AVX512F.

func M256MaskFmsubaddPs

func M256MaskFmsubaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask_fmsubadd_ps'. Requires AVX512F.

func M256MaskFnmaddPd

func M256MaskFnmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask_fnmadd_pd'. Requires AVX512F.

func M256MaskFnmaddPs

func M256MaskFnmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask_fnmadd_ps'. Requires AVX512F.

func M256MaskFnmsubPd

func M256MaskFnmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask_fnmsub_pd'. Requires AVX512F.

func M256MaskFnmsubPs

func M256MaskFnmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask_fnmsub_ps'. Requires AVX512F.

func M256MaskGetexpPd

func M256MaskGetexpPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_mask_getexp_pd'. Requires AVX512F.
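
For positive, normal inputs the per-element operation is just the unbiased exponent, which the standard library exposes through math.Frexp. A scalar sketch under that assumption (zero, denormals, NaN and infinity are not modelled):

	import "math"

	// getexpFP64 returns floor(log2(x)) as a float64 for positive,
	// normal x, mirroring ConvertExpFP64 above.
	func getexpFP64(x float64) float64 {
		_, exp := math.Frexp(x) // x = frac * 2^exp with frac in [0.5, 1)
		return float64(exp - 1)
	}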

func M256MaskGetexpPs

func M256MaskGetexpPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_mask_getexp_ps'. Requires AVX512F.

func M256MaskGetmantPd

func M256MaskGetmantPd(src x86.M256d, k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can

take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_mask_getmant_pd'. Requires AVX512F.

func M256MaskGetmantPs

func M256MaskGetmantPs(src x86.M256, k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can

take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_mask_getmant_ps'. Requires AVX512F.

func M256MaskInsertf32x4

func M256MaskInsertf32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_mask_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
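
'imm8' first selects which 128-bit half of the temporary receives 'b'; only then is the writemask applied. A pure-Go sketch of the two steps, with illustrative names and 'imm8' treated as a runtime value rather than an immediate:

	// maskInsertF32x4 copies a, overwrites the imm8-selected 128-bit
	// half (4 lanes) with b, then merges with src under mask k.
	func maskInsertF32x4(src [8]float32, k uint8, a [8]float32, b [4]float32, imm8 byte) [8]float32 {
		tmp := a
		off := 0
		if imm8&1 != 0 {
			off = 4
		}
		copy(tmp[off:off+4], b[:])
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return dst
	}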

func M256MaskInserti32x4

func M256MaskInserti32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_mask_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskMaxEpi32

func M256MaskMaxEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm256_mask_max_epi32'. Requires AVX512F.
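
A scalar Go sketch of this masked-max pattern (not the intrinsic; the helper
name is illustrative):

	// maskMaxEpi32 mirrors the VPMAXSD writemask behavior on 8 lanes.
	func maskMaxEpi32(src, a, b [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = src[j] // mask bit clear: keep the src element
			case a[j] > b[j]:
				dst[j] = a[j]
			default:
				dst[j] = b[j]
			}
		}
		return
	}

The Min/Epu variants below differ only in the comparison ('<' instead of '>',
unsigned instead of signed) and in lane width.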

func M256MaskMaxEpi64

func M256MaskMaxEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_mask_max_epi64'. Requires AVX512F.

func M256MaskMaxEpu32

func M256MaskMaxEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm256_mask_max_epu32'. Requires AVX512F.

func M256MaskMaxEpu64

func M256MaskMaxEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_mask_max_epu64'. Requires AVX512F.

func M256MaskMaxPd

func M256MaskMaxPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm256_mask_max_pd'. Requires AVX512F.

func M256MaskMaxPs

func M256MaskMaxPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm256_mask_max_ps'. Requires AVX512F.

func M256MaskMinEpi32

func M256MaskMinEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm256_mask_min_epi32'. Requires AVX512F.

func M256MaskMinEpi64

func M256MaskMinEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_mask_min_epi64'. Requires AVX512F.

func M256MaskMinEpu32

func M256MaskMinEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm256_mask_min_epu32'. Requires AVX512F.

func M256MaskMinEpu64

func M256MaskMinEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_mask_min_epu64'. Requires AVX512F.

func M256MaskMinPd

func M256MaskMinPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm256_mask_min_pd'. Requires AVX512F.

func M256MaskMinPs

func M256MaskMinPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm256_mask_min_ps'. Requires AVX512F.

func M256MaskMovEpi32

func M256MaskMovEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_mask_mov_epi32'. Requires AVX512F.
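
Every masked intrinsic in this section reduces to this blend step applied to
some unmasked result; the mov form is that blend by itself. A minimal Go
sketch (hypothetical helper, not part of this package):

	// maskMov selects a[j] where the mask bit is set and src[j] elsewhere.
	func maskMov(src, a [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}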

func M256MaskMovEpi64

func M256MaskMovEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_mask_mov_epi64'. Requires AVX512F.

func M256MaskMovPd

func M256MaskMovPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm256_mask_mov_pd'. Requires AVX512F.

func M256MaskMovPs

func M256MaskMovPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm256_mask_mov_ps'. Requires AVX512F.

func M256MaskMovedupPd

func M256MaskMovedupPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_mask_movedup_pd'. Requires AVX512F.

func M256MaskMovehdupPs

func M256MaskMovehdupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_mask_movehdup_ps'. Requires AVX512F.

func M256MaskMoveldupPs

func M256MaskMoveldupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_mask_moveldup_ps'. Requires AVX512F.

func M256MaskMulEpi32

func M256MaskMulEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm256_mask_mul_epi32'. Requires AVX512F.
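
The multiply is widening: only the low 32 bits of each 64-bit lane
participate, sign-extended to a full 64-bit product. A plain-Go sketch
(illustrative name):

	// maskMulEpi32 multiplies the low halves of each 64-bit lane as signed
	// 32-bit values, producing a signed 64-bit result per lane.
	func maskMulEpi32(src, a, b [4]int64, k uint8) (dst [4]int64) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j])) // sign-extend low halves
			} else {
				dst[j] = src[j]
			}
		}
		return
	}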

func M256MaskMulEpu32

func M256MaskMulEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_mask_mul_epu32'. Requires AVX512F.

func M256MaskMulPd

func M256MaskMulPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm256_mask_mul_pd'. Requires AVX512F.

func M256MaskMulPs

func M256MaskMulPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm256_mask_mul_ps'. Requires AVX512F.

func M256MaskMulloEpi32

func M256MaskMulloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm256_mask_mullo_epi32'. Requires AVX512F.
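
Unlike the widening VPMULDQ form above, mullo keeps only the low 32 bits of
each product, which is ordinary wrapping multiplication. Sketch (assumed
helper name):

	// maskMulloEpi32 stores the low 32 bits of each 64-bit product.
	func maskMulloEpi32(src, a, b [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] * b[j] // Go's int32 multiply already truncates
			} else {
				dst[j] = src[j]
			}
		}
		return
	}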

func M256MaskOrEpi32

func M256MaskOrEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORD'. Intrinsic: '_mm256_mask_or_epi32'. Requires AVX512F.

func M256MaskOrEpi64

func M256MaskOrEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm256_mask_or_epi64'. Requires AVX512F.

func M256MaskPermutePd

func M256MaskPermutePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutePs

func M256MaskPermutePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutevarPd

func M256MaskPermutevarPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)

M256MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permutevar_pd'. Requires AVX512F.

func M256MaskPermutevarPs

func M256MaskPermutevarPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)

M256MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permutevar_ps'. Requires AVX512F.

func M256MaskPermutex2varEpi32

func M256MaskPermutex2varEpi32(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm256_mask_permutex2var_epi32'. Requires AVX512F.
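
Each index element carries a 3-bit lane selector plus a fourth bit that picks
between the two sources. A plain-Go sketch of that addressing (hypothetical
helper):

	// maskPermutex2varEpi32: idx bits [2:0] choose a lane, bit 3 chooses
	// between 'a' and 'b'; masked-off lanes keep their value from 'a'.
	func maskPermutex2varEpi32(a [8]int32, k uint8, idx, b [8]int32) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = a[j]
				continue
			}
			off := idx[j] & 7
			if idx[j]&8 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}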

func M256MaskPermutex2varEpi64

func M256MaskPermutex2varEpi64(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm256_mask_permutex2var_epi64'. Requires AVX512F.

func M256MaskPermutex2varPd

func M256MaskPermutex2varPd(a x86.M256d, k x86.Mmask8, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm256_mask_permutex2var_pd'. Requires AVX512F.

func M256MaskPermutex2varPs

func M256MaskPermutex2varPs(a x86.M256, k x86.Mmask8, idx x86.M256i, b x86.M256) (dst x86.M256)

M256MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm256_mask_permutex2var_ps'. Requires AVX512F.

func M256MaskPermutexEpi64

func M256MaskPermutexEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutexPd

func M256MaskPermutexPd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutexvarEpi32

func M256MaskPermutexvarEpi32(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_mask_permutexvar_epi32'. Requires AVX512F.
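
The single-source form is simpler: each destination lane reads the lane of
'a' named by the low 3 bits of the corresponding index. Sketch (illustrative
name):

	// maskPermutexvarEpi32 gathers lanes of 'a' by index, across lanes.
	func maskPermutexvarEpi32(src [8]int32, k uint8, idx, a [8]int32) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[idx[j]&7]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}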

func M256MaskPermutexvarEpi64

func M256MaskPermutexvarEpi64(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutexvar_epi64'. Requires AVX512F.

func M256MaskPermutexvarPd

func M256MaskPermutexvarPd(src x86.M256d, k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutexvar_pd'. Requires AVX512F.

func M256MaskPermutexvarPs

func M256MaskPermutexvarPs(src x86.M256, k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)

M256MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_mask_permutexvar_ps'. Requires AVX512F.

func M256MaskRcp14Pd

func M256MaskRcp14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_mask_rcp14_pd'. Requires AVX512F.

func M256MaskRcp14Ps

func M256MaskRcp14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_mask_rcp14_ps'. Requires AVX512F.

func M256MaskRolEpi32

func M256MaskRolEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
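
The rotate itself matches Go's math/bits. A scalar sketch, assuming
'import "math/bits"' (Go 1.9+); the helper name is illustrative:

	// maskRolEpi32 rotates each selected lane left by imm8 mod 32.
	func maskRolEpi32(src, a [8]uint32, k uint8, imm8 byte) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(imm8)%32)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

The ror/rolv/rorv variants below differ only in direction and in taking
per-lane counts from 'b' instead of an immediate.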

func M256MaskRolEpi64

func M256MaskRolEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRolvEpi32

func M256MaskRolvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_mask_rolv_epi32'. Requires AVX512F.

func M256MaskRolvEpi64

func M256MaskRolvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_mask_rolv_epi64'. Requires AVX512F.

func M256MaskRorEpi32

func M256MaskRorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRorEpi64

func M256MaskRorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRorvEpi32

func M256MaskRorvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_mask_rorv_epi32'. Requires AVX512F.

func M256MaskRorvEpi64

func M256MaskRorvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_mask_rorv_epi64'. Requires AVX512F.

func M256MaskRoundscalePd

func M256MaskRoundscalePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
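
Ignoring the precision-exception bookkeeping, the scale-round-unscale core is
small. A per-element Go sketch for the round-to-nearest-even direction
(assumes 'import "math"'; math.RoundToEven needs Go 1.10+):

	// roundscale rounds x to m fraction bits: 2^-m * round(2^m * x).
	func roundscale(x float64, m uint8) float64 {
		scaled := math.Ldexp(x, int(m)) // 2^m * x
		return math.Ldexp(math.RoundToEven(scaled), -int(m))
	}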

func M256MaskRoundscalePs

func M256MaskRoundscalePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRsqrt14Pd

func M256MaskRsqrt14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_mask_rsqrt14_pd'. Requires AVX512F.

func M256MaskRsqrt14Ps

func M256MaskRsqrt14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_mask_rsqrt14_ps'. Requires AVX512F.

func M256MaskScalefPd

func M256MaskScalefPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_mask_scalef_pd'. Requires AVX512F.
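
For finite, in-range inputs the SCALE() helper above is just a * 2^floor(b).
A one-line Go sketch that omits the NaN/denormal branches (assumes
'import "math"'):

	// scalef computes a * 2^floor(b) for ordinary finite inputs.
	func scalef(a, b float64) float64 {
		return math.Ldexp(a, int(math.Floor(b)))
	}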

func M256MaskScalefPs

func M256MaskScalefPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_mask_scalef_ps'. Requires AVX512F.

func M256MaskSet1Epi32

func M256MaskSet1Epi32(src x86.M256i, k x86.Mmask8, a int) (dst x86.M256i)

M256MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_set1_epi32'. Requires AVX512F.

func M256MaskSet1Epi64

func M256MaskSet1Epi64(src x86.M256i, k x86.Mmask8, a int64) (dst x86.M256i)

M256MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_set1_epi64'. Requires AVX512F.

func M256MaskShuffleEpi32

func M256MaskShuffleEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm256_mask_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
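
Each destination dword takes a 2-bit selector from 'imm8' and reads within
its own 128-bit lane. A plain-Go sketch of the SELECT4 pattern (hypothetical
helper):

	// maskShuffleEpi32 applies imm8's four 2-bit selectors per 128-bit lane.
	func maskShuffleEpi32(src [8]int32, k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = src[j]
				continue
			}
			lane := (j / 4) * 4                 // 0 for the low lane, 4 for the high
			sel := int(imm8>>(uint(j%4)*2)) & 3 // 2-bit selector for this dword
			dst[j] = a[lane+sel]
		}
		return
	}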

func M256MaskShuffleF32x4

func M256MaskShuffleF32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_mask_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleF64x2

func M256MaskShuffleF64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_mask_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleI32x4

func M256MaskShuffleI32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_mask_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleI64x2

func M256MaskShuffleI64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_mask_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShufflePd

func M256MaskShufflePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm256_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShufflePs

func M256MaskShufflePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm256_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSllEpi32

func M256MaskSllEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_sll_epi32'. Requires AVX512F.

func M256MaskSllEpi64

func M256MaskSllEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_sll_epi64'. Requires AVX512F.

func M256MaskSlliEpi32

func M256MaskSlliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSlliEpi64

func M256MaskSlliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSllvEpi32

func M256MaskSllvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm256_mask_sllv_epi32'. Requires AVX512F.
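
Counts of 32 or more produce zero, which Go's shift semantics on uint32
already guarantee, so a sketch needs no explicit clamp (illustrative helper
name):

	// maskSllvEpi32 shifts each selected lane left by its own count.
	func maskSllvEpi32(src, a, count [8]uint32, k uint8) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] << count[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}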

func M256MaskSllvEpi64

func M256MaskSllvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_mask_sllv_epi64'. Requires AVX512F.

func M256MaskSqrtPd

func M256MaskSqrtPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm256_mask_sqrt_pd'. Requires AVX512F.

func M256MaskSqrtPs

func M256MaskSqrtPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm256_mask_sqrt_ps'. Requires AVX512F.

func M256MaskSraEpi32

func M256MaskSraEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_sra_epi32'. Requires AVX512F.

func M256MaskSraEpi64

func M256MaskSraEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_sra_epi64'. Requires AVX512F.

func M256MaskSraiEpi32

func M256MaskSraiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
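
An arithmetic shift by 31 already replicates the sign into every bit, so the
'imm8 > 31' case can be folded into a clamp. Scalar sketch (hypothetical
helper):

	// maskSraiEpi32 arithmetic-shifts each selected lane right by imm8,
	// saturating the count at 31 so oversized shifts leave only the sign.
	func maskSraiEpi32(src [8]int32, k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
		n := uint(imm8)
		if n > 31 {
			n = 31
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] >> n
			} else {
				dst[j] = src[j]
			}
		}
		return
	}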

func M256MaskSraiEpi64

func M256MaskSraiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSravEpi32

func M256MaskSravEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm256_mask_srav_epi32'. Requires AVX512F.

func M256MaskSravEpi64

func M256MaskSravEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_mask_srav_epi64'. Requires AVX512F.

func M256MaskSrlEpi32

func M256MaskSrlEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srl_epi32'. Requires AVX512F.

func M256MaskSrlEpi64

func M256MaskSrlEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srl_epi64'. Requires AVX512F.

func M256MaskSrliEpi32

func M256MaskSrliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSrliEpi64

func M256MaskSrliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSrlvEpi32

func M256MaskSrlvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm256_mask_srlv_epi32'. Requires AVX512F.

func M256MaskSrlvEpi64

func M256MaskSrlvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_mask_srlv_epi64'. Requires AVX512F.
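
The Srav/Srlv variants take a per-lane shift count from the matching element of 'count' rather than a single scalar. A scalar Go sketch contrasting sign fill (srav) with zero fill (srlv); the pseudocode above leaves counts over 31 implicit, and the saturation here follows the documented behavior of VPSRAVD/VPSRLVD. Helper names are illustrative.

package main

import "fmt"

// sravLane fills vacated bits with the sign bit; srlvLane fills with zeros.
// Counts above 31 saturate (all sign bits, or zero), per VPSRAVD/VPSRLVD.
func sravLane(a int32, n uint32) int32 {
	if n > 31 {
		return a >> 31
	}
	return a >> n
}

func srlvLane(a int32, n uint32) int32 {
	if n > 31 {
		return 0
	}
	return int32(uint32(a) >> n)
}

func main() {
	fmt.Println(sravLane(-8, 1), srlvLane(-8, 1)) // -4 vs 2147483644
}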

func M256MaskSubEpi32

func M256MaskSubEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm256_mask_sub_epi32'. Requires AVX512F.

func M256MaskSubEpi64

func M256MaskSubEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm256_mask_sub_epi64'. Requires AVX512F.

func M256MaskSubPd

func M256MaskSubPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm256_mask_sub_pd'. Requires AVX512F.

func M256MaskSubPs

func M256MaskSubPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm256_mask_sub_ps'. Requires AVX512F.

func M256MaskTernarylogicEpi32

func M256MaskTernarylogicEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskTernarylogicEpi64

func M256MaskTernarylogicEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
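
The 'imm8' acts as an 8-entry truth table indexed by the bit triple (src, a, b). A scalar Go sketch of one 32-bit lane; ternlog32 is an illustrative name, not part of this package.

package main

import "fmt"

// ternlog32 models one VPTERNLOGD lane: for each bit position h, the triple
// (src, a, b) forms a 3-bit index that selects one truth-table bit of imm8.
func ternlog32(src, a, b uint32, imm8 uint8) uint32 {
	var dst uint32
	for h := 0; h < 32; h++ {
		idx := ((src>>uint(h))&1)<<2 | ((a>>uint(h))&1)<<1 | (b>>uint(h))&1
		dst |= ((uint32(imm8) >> idx) & 1) << uint(h)
	}
	return dst
}

func main() {
	// 0x96 is the XOR3 table (bit idx set exactly when idx has odd parity),
	// so the result is src ^ a ^ b.
	fmt.Printf("%#x\n", ternlog32(0xF0F0F0F0, 0xCCCCCCCC, 0xAAAAAAAA, 0x96)) // 0x96969696
}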

func M256MaskTestEpi32Mask

func M256MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm256_mask_test_epi32_mask'. Requires AVX512F.

func M256MaskTestEpi64Mask

func M256MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_mask_test_epi64_mask'. Requires AVX512F.

func M256MaskTestnEpi32Mask

func M256MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_mask_testn_epi32_mask'. Requires AVX512F.

func M256MaskTestnEpi64Mask

func M256MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_mask_testn_epi64_mask'. Requires AVX512F.
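
Test and Testn differ only in the sense of the zero test, and both are gated by 'k1'. A scalar Go sketch with illustrative names:

package main

import "fmt"

// maskTestEpi32 models _mm256_mask_test_epi32_mask: bit j of the result is
// set when k1[j] is set and a[j]&b[j] is non-zero. testn flips the test,
// giving the VPTESTNMD behavior.
func maskTestEpi32(k1 uint8, a, b [8]uint32, testn bool) (k uint8) {
	for j := 0; j < 8; j++ {
		if (k1>>uint(j))&1 == 0 {
			continue // gated off by k1; result bit stays zero
		}
		zero := a[j]&b[j] == 0
		if zero == testn {
			k |= 1 << uint(j)
		}
	}
	return
}

func main() {
	a := [8]uint32{1, 2, 4, 8, 0, 0, 3, 5}
	b := [8]uint32{1, 1, 4, 4, 9, 0, 1, 2}
	fmt.Printf("%08b\n", maskTestEpi32(0xFF, a, b, false)) // 01000101
}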

func M256MaskUnpackhiEpi32

func M256MaskUnpackhiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_mask_unpackhi_epi32'. Requires AVX512F.

func M256MaskUnpackhiEpi64

func M256MaskUnpackhiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_mask_unpackhi_epi64'. Requires AVX512F.

func M256MaskUnpackhiPd

func M256MaskUnpackhiPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_mask_unpackhi_pd'. Requires AVX512F.

func M256MaskUnpackhiPs

func M256MaskUnpackhiPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_mask_unpackhi_ps'. Requires AVX512F.

func M256MaskUnpackloEpi32

func M256MaskUnpackloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_mask_unpacklo_epi32'. Requires AVX512F.

func M256MaskUnpackloEpi64

func M256MaskUnpackloEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_mask_unpacklo_epi64'. Requires AVX512F.

func M256MaskUnpackloPd

func M256MaskUnpackloPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_mask_unpacklo_pd'. Requires AVX512F.

func M256MaskUnpackloPs

func M256MaskUnpackloPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_mask_unpacklo_ps'. Requires AVX512F.
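
All eight unpack variants above follow the same pattern of two independent 128-bit lanes; only the element width and the half selected differ. A scalar Go sketch of the 32-bit low-half interleave (illustrative name):

package main

import "fmt"

// unpackloEpi32 models VPUNPCKLDQ on a 256-bit vector viewed as two
// independent 128-bit lanes of four uint32s each: within each lane, the
// low two elements of a and b are interleaved.
func unpackloEpi32(a, b [8]uint32) (dst [8]uint32) {
	for lane := 0; lane < 2; lane++ {
		o := lane * 4
		dst[o+0] = a[o+0]
		dst[o+1] = b[o+0]
		dst[o+2] = a[o+1]
		dst[o+3] = b[o+1]
	}
	return
}

func main() {
	a := [8]uint32{0, 1, 2, 3, 4, 5, 6, 7}
	b := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(unpackloEpi32(a, b)) // [0 10 1 11 4 14 5 15]
}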

func M256MaskXorEpi32

func M256MaskXorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm256_mask_xor_epi32'. Requires AVX512F.

func M256MaskXorEpi64

func M256MaskXorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm256_mask_xor_epi64'. Requires AVX512F.

func M256MaskzAbsEpi32

func M256MaskzAbsEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm256_maskz_abs_epi32'. Requires AVX512F.

func M256MaskzAbsEpi64

func M256MaskzAbsEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_maskz_abs_epi64'. Requires AVX512F.
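
The Maskz variants that follow differ from the Mask variants above only in the inactive-lane policy: lanes are zeroed instead of copied from 'src'. A minimal scalar sketch of the zeromask form (illustrative name):

package main

import "fmt"

// maskzAbsEpi32 models _mm256_maskz_abs_epi32: active lanes get |a|,
// inactive lanes are zeroed rather than copied from a src operand.
func maskzAbsEpi32(k uint8, a [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			v := a[j]
			if v < 0 {
				v = -v // note: math.MinInt32 wraps to itself, as in hardware
			}
			dst[j] = v
		} // else: lane already zero
	}
	return
}

func main() {
	fmt.Println(maskzAbsEpi32(0x0F, [8]int32{-1, 2, -3, 4, -5, 6, -7, 8}))
	// [1 2 3 4 0 0 0 0]
}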

func M256MaskzAddEpi32

func M256MaskzAddEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm256_maskz_add_epi32'. Requires AVX512F.

func M256MaskzAddEpi64

func M256MaskzAddEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm256_maskz_add_epi64'. Requires AVX512F.

func M256MaskzAndEpi32

func M256MaskzAndEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm256_maskz_and_epi32'. Requires AVX512F.

func M256MaskzAndEpi64

func M256MaskzAndEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm256_maskz_and_epi64'. Requires AVX512F.

func M256MaskzAndnotEpi32

func M256MaskzAndnotEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm256_maskz_andnot_epi32'. Requires AVX512F.

func M256MaskzAndnotEpi64

func M256MaskzAndnotEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm256_maskz_andnot_epi64'. Requires AVX512F.

func M256MaskzBroadcastF32x4

func M256MaskzBroadcastF32x4(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_maskz_broadcast_f32x4'. Requires AVX512F.

func M256MaskzBroadcastI32x4

func M256MaskzBroadcastI32x4(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_maskz_broadcast_i32x4'. Requires AVX512F.
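
Both x4 broadcasts repeat the 128-bit source across the destination, so lane j reads source lane j mod 4, exactly as the 'n := (j mod 4)*32' indexing above spells out. A scalar sketch (illustrative name):

package main

import "fmt"

// maskzBroadcastI32x4 models VBROADCASTI32X4 with a zeromask: the 4-element
// source repeats across the 8 destination lanes.
func maskzBroadcastI32x4(k uint8, a [4]uint32) (dst [8]uint32) {
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[j] = a[j%4]
		}
	}
	return
}

func main() {
	fmt.Println(maskzBroadcastI32x4(0xFF, [4]uint32{1, 2, 3, 4}))
	// [1 2 3 4 1 2 3 4]
}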

func M256MaskzBroadcastdEpi32

func M256MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_broadcastd_epi32'. Requires AVX512F.

func M256MaskzBroadcastqEpi64

func M256MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_broadcastq_epi64'. Requires AVX512F.

func M256MaskzBroadcastsdPd

func M256MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_maskz_broadcastsd_pd'. Requires AVX512F.

func M256MaskzBroadcastssPs

func M256MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_maskz_broadcastss_ps'. Requires AVX512F.

func M256MaskzCompressEpi32

func M256MaskzCompressEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_maskz_compress_epi32'. Requires AVX512F.

func M256MaskzCompressEpi64

func M256MaskzCompressEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_maskz_compress_epi64'. Requires AVX512F.

func M256MaskzCompressPd

func M256MaskzCompressPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_maskz_compress_pd'. Requires AVX512F.

func M256MaskzCompressPs

func M256MaskzCompressPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_maskz_compress_ps'. Requires AVX512F.
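
All four Compress variants pack the active lanes contiguously at the bottom of 'dst' and zero the tail. A scalar Go sketch of the 32-bit integer form (illustrative name):

package main

import "fmt"

// maskzCompressEpi32 models VPCOMPRESSD: active lanes are written
// contiguously from index 0 upward; the remaining lanes stay zero.
func maskzCompressEpi32(k uint8, a [8]uint32) (dst [8]uint32) {
	m := 0
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[m] = a[j]
			m++
		}
	}
	return // dst[m:] is already zero
}

func main() {
	a := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(maskzCompressEpi32(0b10110010, a)) // [11 14 15 17 0 0 0 0]
}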

func M256MaskzCvtRoundpsPh

func M256MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvt_roundps_ph'. Requires AVX512F.

func M256MaskzCvtepi16Epi32

func M256MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_maskz_cvtepi16_epi32'. Requires AVX512F.

func M256MaskzCvtepi16Epi64

func M256MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_maskz_cvtepi16_epi64'. Requires AVX512F.

func M256MaskzCvtepi32Epi16

func M256MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_maskz_cvtepi32_epi16'. Requires AVX512F.

func M256MaskzCvtepi32Epi64

func M256MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_maskz_cvtepi32_epi64'. Requires AVX512F.

func M256MaskzCvtepi32Epi8

func M256MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_maskz_cvtepi32_epi8'. Requires AVX512F.

func M256MaskzCvtepi32Pd

func M256MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_maskz_cvtepi32_pd'. Requires AVX512F.

func M256MaskzCvtepi32Ps

func M256MaskzCvtepi32Ps(k x86.Mmask8, a x86.M256i) (dst x86.M256)

M256MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_maskz_cvtepi32_ps'. Requires AVX512F.

func M256MaskzCvtepi64Epi16

func M256MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_maskz_cvtepi64_epi16'. Requires AVX512F.

func M256MaskzCvtepi64Epi32

func M256MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_maskz_cvtepi64_epi32'. Requires AVX512F.

func M256MaskzCvtepi64Epi8

func M256MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_maskz_cvtepi64_epi8'. Requires AVX512F.

func M256MaskzCvtepi8Epi32

func M256MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_maskz_cvtepi8_epi32'. Requires AVX512F.

func M256MaskzCvtepi8Epi64

func M256MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_maskz_cvtepi8_epi64'. Requires AVX512F.

func M256MaskzCvtepu16Epi32

func M256MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_maskz_cvtepu16_epi32'. Requires AVX512F.

func M256MaskzCvtepu16Epi64

func M256MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_maskz_cvtepu16_epi64'. Requires AVX512F.

func M256MaskzCvtepu32Epi64

func M256MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_maskz_cvtepu32_epi64'. Requires AVX512F.

func M256MaskzCvtepu32Pd

func M256MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_maskz_cvtepu32_pd'. Requires AVX512F.

func M256MaskzCvtepu8Epi32

func M256MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_maskz_cvtepu8_epi32'. Requires AVX512F.

func M256MaskzCvtepu8Epi64

func M256MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_maskz_cvtepu8_epi64'. Requires AVX512F.

func M256MaskzCvtpdEpi32

func M256MaskzCvtpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_maskz_cvtpd_epi32'. Requires AVX512F.

func M256MaskzCvtpdEpu32

func M256MaskzCvtpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_maskz_cvtpd_epu32'. Requires AVX512F.

func M256MaskzCvtpdPs

func M256MaskzCvtpdPs(k x86.Mmask8, a x86.M256d) (dst x86.M128)

M256MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_maskz_cvtpd_ps'. Requires AVX512F.

func M256MaskzCvtphPs

func M256MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M256)

M256MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_maskz_cvtph_ps'. Requires AVX512F.

func M256MaskzCvtpsEpi32

func M256MaskzCvtpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_maskz_cvtps_epi32'. Requires AVX512F.

func M256MaskzCvtpsEpu32

func M256MaskzCvtpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_maskz_cvtps_epu32'. Requires AVX512F.

func M256MaskzCvtpsPh

func M256MaskzCvtpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvtps_ph'. Requires AVX512F.

func M256MaskzCvtsepi32Epi16

func M256MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_maskz_cvtsepi32_epi16'. Requires AVX512F.

func M256MaskzCvtsepi32Epi8

func M256MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_maskz_cvtsepi32_epi8'. Requires AVX512F.

func M256MaskzCvtsepi64Epi16

func M256MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_maskz_cvtsepi64_epi16'. Requires AVX512F.

func M256MaskzCvtsepi64Epi32

func M256MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_maskz_cvtsepi64_epi32'. Requires AVX512F.

func M256MaskzCvtsepi64Epi8

func M256MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_maskz_cvtsepi64_epi8'. Requires AVX512F.
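
Unlike the plain truncating converts earlier, the saturating converts clamp out-of-range values to the bounds of the narrower type instead of dropping high bits. A scalar sketch of the per-element operation (illustrative name):

package main

import "fmt"

// saturateInt64ToInt8 models the per-element operation of VPMOVSQB:
// values outside [-128, 127] clamp to the nearest bound rather than
// being truncated bit-wise.
func saturateInt64ToInt8(v int64) int8 {
	switch {
	case v > 127:
		return 127
	case v < -128:
		return -128
	}
	return int8(v)
}

func main() {
	for _, v := range []int64{-1000, -1, 300, 42} {
		fmt.Println(saturateInt64ToInt8(v)) // -128, -1, 127, 42
	}
}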

func M256MaskzCvttpdEpi32

func M256MaskzCvttpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_maskz_cvttpd_epi32'. Requires AVX512F.

func M256MaskzCvttpdEpu32

func M256MaskzCvttpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_maskz_cvttpd_epu32'. Requires AVX512F.

func M256MaskzCvttpsEpi32

func M256MaskzCvttpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_maskz_cvttps_epi32'. Requires AVX512F.

func M256MaskzCvttpsEpu32

func M256MaskzCvttpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_maskz_cvttps_epu32'. Requires AVX512F.

func M256MaskzCvtusepi32Epi16

func M256MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_maskz_cvtusepi32_epi16'. Requires AVX512F.

func M256MaskzCvtusepi32Epi8

func M256MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_maskz_cvtusepi32_epi8'. Requires AVX512F.

func M256MaskzCvtusepi64Epi16

func M256MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_maskz_cvtusepi64_epi16'. Requires AVX512F.

func M256MaskzCvtusepi64Epi32

func M256MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_maskz_cvtusepi64_epi32'. Requires AVX512F.

func M256MaskzCvtusepi64Epi8

func M256MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_maskz_cvtusepi64_epi8'. Requires AVX512F.

func M256MaskzDivPd

func M256MaskzDivPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm256_maskz_div_pd'. Requires AVX512F.

func M256MaskzDivPs

func M256MaskzDivPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm256_maskz_div_ps'. Requires AVX512F.

func M256MaskzExpandEpi32

func M256MaskzExpandEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_maskz_expand_epi32'. Requires AVX512F.

func M256MaskzExpandEpi64

func M256MaskzExpandEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_maskz_expand_epi64'. Requires AVX512F.

func M256MaskzExpandPd

func M256MaskzExpandPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_maskz_expand_pd'. Requires AVX512F.

func M256MaskzExpandPs

func M256MaskzExpandPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_maskz_expand_ps'. Requires AVX512F.
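
Expand is the inverse of the Compress family earlier in this package: consecutive source elements are scattered into the active lanes, and inactive lanes are zeroed. A scalar sketch (illustrative name):

package main

import "fmt"

// maskzExpandEpi32 models VPEXPANDD: source elements are consumed in order
// and placed into the lanes whose mask bit is set.
func maskzExpandEpi32(k uint8, a [8]uint32) (dst [8]uint32) {
	m := 0
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[j] = a[m]
			m++
		}
	}
	return
}

func main() {
	a := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(maskzExpandEpi32(0b10110010, a)) // [0 10 0 0 11 12 0 13]
}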

func M256MaskzExtractf32x4Ps

func M256MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)

M256MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_maskz_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzExtracti32x4Epi32

func M256MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_maskz_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
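
Under the same array conventions as the expand sketch above, the masked extract reduces to a half-select followed by a per-lane mask:

// extracti32x4Epi32 selects the low (imm8 bit 0 clear) or high (bit 0 set)
// 128-bit half of 'a', then zeroes the lanes whose mask bit is clear.
func extracti32x4Epi32(k uint8, a [8]int32, imm8 byte) (dst [4]int32) {
	var tmp [4]int32
	if imm8&1 == 0 {
		copy(tmp[:], a[0:4]) // a[127:0]
	} else {
		copy(tmp[:], a[4:8]) // a[255:128]
	}
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = tmp[j]
		}
	}
	return dst
}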

func M256MaskzFixupimmPd

func M256MaskzFixupimmPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzFixupimmPs

func M256MaskzFixupimmPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzFmaddPd

func M256MaskzFmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_maskz_fmadd_pd'. Requires AVX512F.
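
The fused multiply-add rounds once, not twice, and Go's standard library exposes the same contract as math.FMA. A minimal model with [4]float64 standing in for x86.M256d:

import "math"

// maskzFmaddPd (hypothetical name) computes (a*b)+c per active lane with
// a single rounding; masked-off lanes stay zero.
func maskzFmaddPd(k uint8, a, b, c [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = math.FMA(a[j], b[j], c[j])
		}
	}
	return dst
}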

func M256MaskzFmaddPs

func M256MaskzFmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_maskz_fmadd_ps'. Requires AVX512F.

func M256MaskzFmaddsubPd

func M256MaskzFmaddsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_maskz_fmaddsub_pd'. Requires AVX512F.
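
Note the lane parity: even-indexed lanes subtract 'c' and odd-indexed lanes add it. Made concrete under the same conventions:

import "math"

// maskzFmaddsubPd (hypothetical name): even lanes compute (a*b)-c, odd
// lanes (a*b)+c; negating c preserves the single-rounding FMA behavior.
func maskzFmaddsubPd(k uint8, a, b, c [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) == 0 {
			continue // zeromask: lane stays 0
		}
		if j%2 == 0 {
			dst[j] = math.FMA(a[j], b[j], -c[j])
		} else {
			dst[j] = math.FMA(a[j], b[j], c[j])
		}
	}
	return dst
}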

func M256MaskzFmaddsubPs

func M256MaskzFmaddsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_maskz_fmaddsub_ps'. Requires AVX512F.

func M256MaskzFmsubPd

func M256MaskzFmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_maskz_fmsub_pd'. Requires AVX512F.

func M256MaskzFmsubPs

func M256MaskzFmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_maskz_fmsub_ps'. Requires AVX512F.

func M256MaskzFmsubaddPd

func M256MaskzFmsubaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_maskz_fmsubadd_pd'. Requires AVX512F.

func M256MaskzFmsubaddPs

func M256MaskzFmsubaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_maskz_fmsubadd_ps'. Requires AVX512F.

func M256MaskzFnmaddPd

func M256MaskzFnmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_maskz_fnmadd_pd'. Requires AVX512F.

func M256MaskzFnmaddPs

func M256MaskzFnmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_maskz_fnmadd_ps'. Requires AVX512F.

func M256MaskzFnmsubPd

func M256MaskzFnmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_maskz_fnmsub_pd'. Requires AVX512F.

func M256MaskzFnmsubPs

func M256MaskzFnmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_maskz_fnmsub_ps'. Requires AVX512F.

func M256MaskzGetexpPd

func M256MaskzGetexpPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_maskz_getexp_pd'. Requires AVX512F.

func M256MaskzGetexpPs

func M256MaskzGetexpPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_maskz_getexp_ps'. Requires AVX512F.
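
For finite non-zero inputs, math.Logb returns exactly this quantity: the unbiased binary exponent, floor(log2(|x|)). A scalar-loop model (zeros, infinities and NaN follow IEEE conventions that may differ in detail from ConvertExpFP32):

import "math"

// maskzGetexpPs (hypothetical name) models VGETEXPPS on finite inputs.
func maskzGetexpPs(k uint8, a [8]float32) (dst [8]float32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = float32(math.Logb(float64(a[j])))
		}
	}
	return dst
}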

func M256MaskzGetmantPd

func M256MaskzGetmantPd(k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can
take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:
    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_maskz_getmant_pd'. Requires AVX512F.

func M256MaskzGetmantPs

func M256MaskzGetmantPs(k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can
take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:
    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_maskz_getmant_ps'. Requires AVX512F.

func M256MaskzInsertf32x4

func M256MaskzInsertf32x4(k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_maskz_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzInserti32x4

func M256MaskzInserti32x4(k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_maskz_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzMaxEpi32

func M256MaskzMaxEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm256_maskz_max_epi32'. Requires AVX512F.
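
The comparison logic, transliterated with the zeromask applied per lane:

// maskzMaxEpi32 (hypothetical name) keeps the signed per-lane maximum
// where the mask bit is set and zero elsewhere.
func maskzMaxEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		}
	}
	return dst
}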

func M256MaskzMaxEpi64

func M256MaskzMaxEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_maskz_max_epi64'. Requires AVX512F.

func M256MaskzMaxEpu32

func M256MaskzMaxEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm256_maskz_max_epu32'. Requires AVX512F.

func M256MaskzMaxEpu64

func M256MaskzMaxEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_maskz_max_epu64'. Requires AVX512F.

func M256MaskzMaxPd

func M256MaskzMaxPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm256_maskz_max_pd'. Requires AVX512F.

func M256MaskzMaxPs

func M256MaskzMaxPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm256_maskz_max_ps'. Requires AVX512F.

func M256MaskzMinEpi32

func M256MaskzMinEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm256_maskz_min_epi32'. Requires AVX512F.

func M256MaskzMinEpi64

func M256MaskzMinEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_maskz_min_epi64'. Requires AVX512F.

func M256MaskzMinEpu32

func M256MaskzMinEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm256_maskz_min_epu32'. Requires AVX512F.

func M256MaskzMinEpu64

func M256MaskzMinEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_maskz_min_epu64'. Requires AVX512F.

func M256MaskzMinPd

func M256MaskzMinPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm256_maskz_min_pd'. Requires AVX512F.

func M256MaskzMinPs

func M256MaskzMinPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm256_maskz_min_ps'. Requires AVX512F.

func M256MaskzMovEpi32

func M256MaskzMovEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_maskz_mov_epi32'. Requires AVX512F.

func M256MaskzMovEpi64

func M256MaskzMovEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_maskz_mov_epi64'. Requires AVX512F.

func M256MaskzMovPd

func M256MaskzMovPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm256_maskz_mov_pd'. Requires AVX512F.

func M256MaskzMovPs

func M256MaskzMovPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm256_maskz_mov_ps'. Requires AVX512F.

func M256MaskzMovedupPd

func M256MaskzMovedupPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_maskz_movedup_pd'. Requires AVX512F.
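
The duplication copies each even-indexed lane into the following odd lane, so the source index is just the lane index with its low bit cleared:

// maskzMovedupPd (hypothetical name): j&^1 yields source lanes 0, 0, 2, 2.
func maskzMovedupPd(k uint8, a [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[j&^1]
		}
	}
	return dst
}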

func M256MaskzMovehdupPs

func M256MaskzMovehdupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_maskz_movehdup_ps'. Requires AVX512F.

func M256MaskzMoveldupPs

func M256MaskzMoveldupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_maskz_moveldup_ps'. Requires AVX512F.

func M256MaskzMulEpi32

func M256MaskzMulEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm256_maskz_mul_epi32'. Requires AVX512F.
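
Only the low 32 bits of each 64-bit lane participate, read as signed values, and the full 64-bit product is kept. In Go the truncation is a plain conversion:

// maskzMulEpi32 (hypothetical name) models VPMULDQ: int32(x) keeps the
// low 32 bits with their sign; widening before the multiply preserves
// the full 64-bit product.
func maskzMulEpi32(k uint8, a, b [4]int64) (dst [4]int64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
		}
	}
	return dst
}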

func M256MaskzMulEpu32

func M256MaskzMulEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_maskz_mul_epu32'. Requires AVX512F.

func M256MaskzMulPd

func M256MaskzMulPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm256_maskz_mul_pd'. Requires AVX512F.

func M256MaskzMulPs

func M256MaskzMulPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm256_maskz_mul_ps'. Requires AVX512F.

func M256MaskzMulloEpi32

func M256MaskzMulloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm256_maskz_mullo_epi32'. Requires AVX512F.
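
Keeping the low 32 bits of the 64-bit intermediate is exactly what wrapping 32-bit multiplication yields, so no explicit widening is needed:

// maskzMulloEpi32 (hypothetical name): int32 multiplication in Go wraps
// modulo 2^32, matching the "store the low 32 bits" step above.
func maskzMulloEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[j] * b[j]
		}
	}
	return dst
}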

func M256MaskzOrEpi32

func M256MaskzOrEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORD'. Intrinsic: '_mm256_maskz_or_epi32'. Requires AVX512F.

func M256MaskzOrEpi64

func M256MaskzOrEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm256_maskz_or_epi64'. Requires AVX512F.

func M256MaskzPermutePd

func M256MaskzPermutePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutePs

func M256MaskzPermutePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutevarPd

func M256MaskzPermutevarPd(k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)

M256MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permutevar_pd'. Requires AVX512F.

func M256MaskzPermutevarPs

func M256MaskzPermutevarPs(k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)

M256MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permutevar_ps'. Requires AVX512F.

func M256MaskzPermutex2varEpi32

func M256MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_maskz_permutex2var_epi32'. Requires AVX512F.
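
Each index element encodes both a source and a lane: bit 3 selects between 'a' and 'b', and bits 2:0 select the lane within the chosen source:

// maskzPermutex2varEpi32 (hypothetical name): idx bit 3 picks the table,
// the low three bits pick the lane within it.
func maskzPermutex2varEpi32(k uint8, a, idx, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) == 0 {
			continue // zeromask: lane stays 0
		}
		off := idx[j] & 7
		if idx[j]&8 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}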

func M256MaskzPermutex2varEpi64

func M256MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_maskz_permutex2var_epi64'. Requires AVX512F.

func M256MaskzPermutex2varPd

func M256MaskzPermutex2varPd(k x86.Mmask8, a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_maskz_permutex2var_pd'. Requires AVX512F.

func M256MaskzPermutex2varPs

func M256MaskzPermutex2varPs(k x86.Mmask8, a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)

M256MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_maskz_permutex2var_ps'. Requires AVX512F.

func M256MaskzPermutexEpi64

func M256MaskzPermutexEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutexPd

func M256MaskzPermutexPd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutexvarEpi32

func M256MaskzPermutexvarEpi32(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_maskz_permutexvar_epi32'. Requires AVX512F.
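
The single-source variant needs only the low three bits of each index:

// maskzPermutexvarEpi32 (hypothetical name) gathers a[idx[j]&7] into
// each active lane.
func maskzPermutexvarEpi32(k uint8, idx, a [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[idx[j]&7]
		}
	}
	return dst
}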

func M256MaskzPermutexvarEpi64

func M256MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutexvar_epi64'. Requires AVX512F.

func M256MaskzPermutexvarPd

func M256MaskzPermutexvarPd(k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutexvar_pd'. Requires AVX512F.

func M256MaskzPermutexvarPs

func M256MaskzPermutexvarPs(k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)

M256MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_maskz_permutexvar_ps'. Requires AVX512F.

func M256MaskzRcp14Pd

func M256MaskzRcp14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_maskz_rcp14_pd'. Requires AVX512F.

func M256MaskzRcp14Ps

func M256MaskzRcp14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_maskz_rcp14_ps'. Requires AVX512F.
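
A hedged model that substitutes the exact reciprocal for the hardware approximation, which is only guaranteed to a relative error below 2^-14:

// maskzRcp14Ps (hypothetical name): exact 1/x stands in for APPROXIMATE.
func maskzRcp14Ps(k uint8, a [8]float32) (dst [8]float32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = 1 / a[j]
		}
	}
	return dst
}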

func M256MaskzRolEpi32

func M256MaskzRolEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRolEpi64

func M256MaskzRolEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRolvEpi32

func M256MaskzRolvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_maskz_rolv_epi32'. Requires AVX512F.
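
math/bits provides the rotate directly, including the modulo-32 reduction of the count performed by LEFT_ROTATE_DWORDS. A model with unsigned lanes (the rorv variants below are the same call with a negated count):

import "math/bits"

// maskzRolvEpi32 (hypothetical name) rotates each active lane left by
// the count held in the corresponding lane of 'b', reduced mod 32.
func maskzRolvEpi32(k uint8, a, b [8]uint32) (dst [8]uint32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
		}
	}
	return dst
}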

func M256MaskzRolvEpi64

func M256MaskzRolvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_maskz_rolv_epi64'. Requires AVX512F.

func M256MaskzRorEpi32

func M256MaskzRorEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRorEpi64

func M256MaskzRorEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRorvEpi32

func M256MaskzRorvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_maskz_rorv_epi32'. Requires AVX512F.

func M256MaskzRorvEpi64

func M256MaskzRorvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_maskz_rorv_epi64'. Requires AVX512F.

func M256MaskzRoundscalePd

func M256MaskzRoundscalePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
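
The core of the operation is "scale up by 2^M, round to an integer, scale back down". A single-lane model of the round-to-nearest-even case (imm8[1:0] == 0) that omits the #PE reporting:

import "math"

// roundscaleLane (hypothetical helper) rounds x to m fraction bits:
// 2^-M * RoundToEven(2^M * x).
func roundscaleLane(x float64, m uint) float64 {
	scaled := math.Ldexp(x, int(m))     // 2^M * x
	rounded := math.RoundToEven(scaled) // nearest integer, ties to even
	return math.Ldexp(rounded, -int(m)) // scale back down
}

For example, roundscaleLane(2.71828, 4) rounds to the nearest multiple of 1/16 and returns 2.6875.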

func M256MaskzRoundscalePs

func M256MaskzRoundscalePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRsqrt14Pd

func M256MaskzRsqrt14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_maskz_rsqrt14_pd'. Requires AVX512F.

func M256MaskzRsqrt14Ps

func M256MaskzRsqrt14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_maskz_rsqrt14_ps'. Requires AVX512F.

func M256MaskzScalefPd

func M256MaskzScalefPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_maskz_scalef_pd'. Requires AVX512F.
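
For finite inputs in range, the scale step is a * 2^floor(b), which math.Ldexp expresses directly; the NaN and denormal handling of SCALE is deliberately omitted here:

import "math"

// maskzScalefPd (hypothetical name) models the finite-input path only:
// floor(b) must fit in an int, and NaN/Inf are not special-cased.
func maskzScalefPd(k uint8, a, b [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
		}
	}
	return dst
}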

func M256MaskzScalefPs

func M256MaskzScalefPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_maskz_scalef_ps'. Requires AVX512F.
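
For readers following the SCALE pseudocode, here is a pure-Go sketch of the common case, omitting the NaN, infinity, and denormal handling. math.Ldexp multiplies by an integral power of two, matching src1 * 2^FLOOR(src2); the helper name is illustrative only.

import "math"

func maskzScalefPd(k uint8, a, b [4]float64) (dst [4]float64) {
	for j := range a {
		if k&(1<<uint(j)) != 0 {
			// a[j] * 2^floor(b[j]); special cases from the pseudocode omitted
			dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
		}
	}
	return
}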

func M256MaskzSet1Epi32

func M256MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M256i)

M256MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_set1_epi32'. Requires AVX512F.

func M256MaskzSet1Epi64

func M256MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M256i)

M256MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_set1_epi64'. Requires AVX512F.
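
The broadcast-with-zeromask pattern is simple enough to state directly in Go; this hypothetical helper mirrors the pseudocode above.

func maskzSet1Epi64(k uint8, a int64) (dst [4]int64) {
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a // broadcast a into selected elements
		} // unselected elements remain zero
	}
	return
}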

func M256MaskzShuffleEpi32

func M256MaskzShuffleEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm256_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
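
The SELECT4 machinery amounts to indexing within each 128-bit (four-element) lane using a 2-bit field of imm8, with the same selectors reused in both lanes. A pure-Go sketch (helper name illustrative):

func maskzShuffleEpi32(k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
	for j := range a {
		lane := (j / 4) * 4                 // base element of this 128-bit lane
		sel := int(imm8>>(uint(j%4)*2)) & 3 // 2-bit selector, reused in both lanes
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[lane+sel]
		}
	}
	return
}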

func M256MaskzShuffleF32x4

func M256MaskzShuffleF32x4(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_maskz_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleF64x2

func M256MaskzShuffleF64x2(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_maskz_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleI32x4

func M256MaskzShuffleI32x4(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_maskz_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleI64x2

func M256MaskzShuffleI64x2(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_maskz_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
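
All four 256-bit lane shuffles above follow the same SELECT2 pattern: one bit of imm8 picks a 128-bit half of 'a' for the low half of the result, and another picks a half of 'b' for the high half. A pure-Go sketch of the 64-bit-integer case (names illustrative):

func maskzShuffleI64x2(k uint8, a, b [4]int64, imm8 byte) (dst [4]int64) {
	lo := int(imm8&1) * 2    // imm8[0]: which half of a feeds the low half
	hi := int(imm8>>1&1) * 2 // imm8[1]: which half of b feeds the high half
	tmp := [4]int64{a[lo], a[lo+1], b[hi], b[hi+1]}
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = tmp[j]
		}
	}
	return
}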

func M256MaskzShufflePd

func M256MaskzShufflePd(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm256_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShufflePs

func M256MaskzShufflePs(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm256_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSllEpi32

func M256MaskzSllEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_sll_epi32'. Requires AVX512F.

func M256MaskzSllEpi64

func M256MaskzSllEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_sll_epi64'. Requires AVX512F.

func M256MaskzSlliEpi32

func M256MaskzSlliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSlliEpi64

func M256MaskzSlliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
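
Note the explicit out-of-range clause in these shift entries: counts above the element width zero the element rather than being truncated. Go's shift operator already yields zero for oversized unsigned shifts, but the sketch below keeps the check to mirror the pseudocode (helper name illustrative).

func maskzSlliEpi32(k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
	for j := range a {
		if k&(1<<uint(j)) != 0 && imm8 <= 31 {
			dst[j] = int32(uint32(a[j]) << imm8) // logical shift, zeros shifted in
		} // masked-off or over-shifted elements are zero
	}
	return
}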

func M256MaskzSllvEpi32

func M256MaskzSllvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm256_maskz_sllv_epi32'. Requires AVX512F.

func M256MaskzSllvEpi64

func M256MaskzSllvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_maskz_sllv_epi64'. Requires AVX512F.

func M256MaskzSqrtPd

func M256MaskzSqrtPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm256_maskz_sqrt_pd'. Requires AVX512F.

func M256MaskzSqrtPs

func M256MaskzSqrtPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm256_maskz_sqrt_ps'. Requires AVX512F.

func M256MaskzSraEpi32

func M256MaskzSraEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_sra_epi32'. Requires AVX512F.

func M256MaskzSraEpi64

func M256MaskzSraEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_sra_epi64'. Requires AVX512F.

func M256MaskzSraiEpi32

func M256MaskzSraiEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSraiEpi64

func M256MaskzSraiEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
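
Arithmetic right shifts differ from the logical shifts above in what the out-of-range case produces: the element saturates to copies of its sign bit (all zeros or all ones) instead of zero. A pure-Go sketch, relying on Go's >> sign-extending signed operands (name illustrative):

func maskzSraiEpi64(k uint8, a [4]int64, imm8 byte) (dst [4]int64) {
	for j := range a {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: element stays zero
		}
		n := imm8
		if n > 63 {
			n = 63 // a shift by 63 leaves only copies of the sign bit
		}
		dst[j] = a[j] >> n // sign bits shifted in
	}
	return
}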

func M256MaskzSravEpi32

func M256MaskzSravEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm256_maskz_srav_epi32'. Requires AVX512F.

func M256MaskzSravEpi64

func M256MaskzSravEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_maskz_srav_epi64'. Requires AVX512F.

func M256MaskzSrlEpi32

func M256MaskzSrlEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srl_epi32'. Requires AVX512F.

func M256MaskzSrlEpi64

func M256MaskzSrlEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srl_epi64'. Requires AVX512F.

func M256MaskzSrliEpi32

func M256MaskzSrliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSrliEpi64

func M256MaskzSrliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSrlvEpi32

func M256MaskzSrlvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm256_maskz_srlv_epi32'. Requires AVX512F.

func M256MaskzSrlvEpi64

func M256MaskzSrlvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_maskz_srlv_epi64'. Requires AVX512F.

func M256MaskzSubEpi32

func M256MaskzSubEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm256_maskz_sub_epi32'. Requires AVX512F.

func M256MaskzSubEpi64

func M256MaskzSubEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm256_maskz_sub_epi64'. Requires AVX512F.

func M256MaskzSubPd

func M256MaskzSubPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm256_maskz_sub_pd'. Requires AVX512F.

func M256MaskzSubPs

func M256MaskzSubPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm256_maskz_sub_ps'. Requires AVX512F.

func M256MaskzTernarylogicEpi32

func M256MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzTernarylogicEpi64

func M256MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
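
VPTERNLOG treats imm8 as an 8-entry truth table: the bits of 'a', 'b', and 'c' at each position form a 3-bit index, and the indexed bit of imm8 becomes the result bit. A pure-Go sketch of that lookup plus the zeromask (names illustrative):

func ternlog64(a, b, c uint64, imm8 byte) (dst uint64) {
	for h := uint(0); h < 64; h++ {
		idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1 // a supplies the high index bit
		dst |= uint64(imm8>>idx&1) << h
	}
	return
}

func maskzTernarylogicEpi64(k uint8, a, b, c [4]uint64, imm8 byte) (dst [4]uint64) {
	for j := range a {
		if k&(1<<uint(j)) != 0 {
			dst[j] = ternlog64(a[j], b[j], c[j], imm8)
		}
	}
	return
}

For example, imm8 = 0xE8 (bits 3, 5, 6, and 7 set) yields the bitwise majority function (a AND b) OR (a AND c) OR (b AND c).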

func M256MaskzUnpackhiEpi32

func M256MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi32'. Requires AVX512F.
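
The interleave helpers operate per 128-bit lane and never cross lanes. A pure-Go sketch of the high-dword case (name illustrative):

func maskzUnpackhiEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	var tmp [8]int32
	for lane := 0; lane < 8; lane += 4 {
		tmp[lane+0] = a[lane+2] // src1[95:64]
		tmp[lane+1] = b[lane+2] // src2[95:64]
		tmp[lane+2] = a[lane+3] // src1[127:96]
		tmp[lane+3] = b[lane+3] // src2[127:96]
	}
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = tmp[j]
		}
	}
	return
}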

func M256MaskzUnpackhiEpi64

func M256MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi64'. Requires AVX512F.

func M256MaskzUnpackhiPd

func M256MaskzUnpackhiPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_maskz_unpackhi_pd'. Requires AVX512F.

func M256MaskzUnpackhiPs

func M256MaskzUnpackhiPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_maskz_unpackhi_ps'. Requires AVX512F.

func M256MaskzUnpackloEpi32

func M256MaskzUnpackloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi32'. Requires AVX512F.

func M256MaskzUnpackloEpi64

func M256MaskzUnpackloEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi64'. Requires AVX512F.

func M256MaskzUnpackloPd

func M256MaskzUnpackloPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_maskz_unpacklo_pd'. Requires AVX512F.

func M256MaskzUnpackloPs

func M256MaskzUnpackloPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_maskz_unpacklo_ps'. Requires AVX512F.

func M256MaskzXorEpi32

func M256MaskzXorEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm256_maskz_xor_epi32'. Requires AVX512F.

func M256MaskzXorEpi64

func M256MaskzXorEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm256_maskz_xor_epi64'. Requires AVX512F.

func M256MaxEpi64

func M256MaxEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_max_epi64'. Requires AVX512F.

func M256MaxEpu64

func M256MaxEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_max_epu64'. Requires AVX512F.

func M256MinEpi64

func M256MinEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_min_epi64'. Requires AVX512F.

func M256MinEpu64

func M256MinEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_min_epu64'. Requires AVX512F.
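
The epi64/epu64 pairs above share one pseudocode skeleton; only the signedness of the comparison differs. In Go that distinction is carried entirely by the element type, as this sketch of the two maxima shows (names illustrative):

func maxEpi64(a, b [4]int64) (dst [4]int64) {
	for j := range a {
		if a[j] > b[j] { // signed compare (VPMAXSQ)
			dst[j] = a[j]
		} else {
			dst[j] = b[j]
		}
	}
	return
}

func maxEpu64(a, b [4]uint64) (dst [4]uint64) {
	for j := range a {
		if a[j] > b[j] { // unsigned compare (VPMAXUQ)
			dst[j] = a[j]
		} else {
			dst[j] = b[j]
		}
	}
	return
}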

func M256Permutex2varEpi32

func M256Permutex2varEpi32(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_permutex2var_epi32'. Requires AVX512F.

func M256Permutex2varEpi64

func M256Permutex2varEpi64(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_permutex2var_epi64'. Requires AVX512F.

func M256Permutex2varPd

func M256Permutex2varPd(a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_permutex2var_pd'. Requires AVX512F.

func M256Permutex2varPs

func M256Permutex2varPs(a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)

M256Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_permutex2var_ps'. Requires AVX512F.
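
In the two-source permutes, each index element does double duty: its low bits select a source element, and the next bit selects between the two source vectors. A pure-Go sketch of the 32-bit case (name illustrative):

func permutex2varEpi32(a, idx, b [8]int32) (dst [8]int32) {
	for j := range dst {
		off := int(idx[j]) & 7 // idx[i+2:i]: element offset
		if idx[j]&8 != 0 {     // idx[i+3]: table select
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}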

func M256PermutexEpi64

func M256PermutexEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256PermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256PermutexPd

func M256PermutexPd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256PermutexvarEpi32

func M256PermutexvarEpi32(idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_permutexvar_epi32'. Requires AVX512F.

func M256PermutexvarEpi64

func M256PermutexvarEpi64(idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutexvar_epi64'. Requires AVX512F.

func M256PermutexvarPd

func M256PermutexvarPd(idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutexvar_pd'. Requires AVX512F.

func M256PermutexvarPs

func M256PermutexvarPs(idx x86.M256i, a x86.M256) (dst x86.M256)

M256PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_permutexvar_ps'. Requires AVX512F.

func M256Rcp14Pd

func M256Rcp14Pd(a x86.M256d) (dst x86.M256d)

M256Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_rcp14_pd'. Requires AVX512F.

func M256Rcp14Ps

func M256Rcp14Ps(a x86.M256) (dst x86.M256)

M256Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_rcp14_ps'. Requires AVX512F.

func M256RolEpi32

func M256RolEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RolEpi64

func M256RolEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RolvEpi32

func M256RolvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_rolv_epi32'. Requires AVX512F.

func M256RolvEpi64

func M256RolvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_rolv_epi64'. Requires AVX512F.

func M256RorEpi32

func M256RorEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RorEpi64

func M256RorEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RorvEpi32

func M256RorvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_rorv_epi32'. Requires AVX512F.

func M256RorvEpi64

func M256RorvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_rorv_epi64'. Requires AVX512F.
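
The rotate family maps directly onto the standard library's math/bits, which already reduces the count modulo the width and expresses a right rotation as a negative left rotation. A sketch of the variable-count pair (names illustrative):

import "math/bits"

func rolvEpi32(a, b [8]uint32) (dst [8]uint32) {
	for j := range a {
		dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
	}
	return
}

func rorvEpi32(a, b [8]uint32) (dst [8]uint32) {
	for j := range a {
		dst[j] = bits.RotateLeft32(a[j], -int(b[j]&31)) // negative count rotates right
	}
	return
}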

func M256RoundscalePd

func M256RoundscalePd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RoundscalePs

func M256RoundscalePs(a x86.M256, imm8 byte) (dst x86.M256)

M256RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
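
Stripped of the rounding-mode selection and the #PE bookkeeping, RoundTo_Integer rounds to M = imm8[7:4] fraction bits: scale up by 2^M, round to an integer, scale back down. A pure-Go sketch for rounding_direction 0, with math.RoundToEven standing in for the hardware's round-to-nearest-even:

import "math"

func roundscale(src float64, imm8 byte) float64 {
	m := int(imm8 >> 4)                         // imm8[7:4]: fraction bits to keep
	tmp := math.RoundToEven(math.Ldexp(src, m)) // round 2^M * src to an integer
	return math.Ldexp(tmp, -m)                  // scale back down by 2^-M
}

For instance, roundscale(1.2345, 0x40) keeps four fraction bits and returns 1.25.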

func M256ScalefPd

func M256ScalefPd(a x86.M256d, b x86.M256d) (dst x86.M256d)

M256ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_scalef_pd'. Requires AVX512F.

func M256ScalefPs

func M256ScalefPs(a x86.M256, b x86.M256) (dst x86.M256)

M256ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_scalef_ps'. Requires AVX512F.

func M256ShuffleF32x4

func M256ShuffleF32x4(a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256ShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleF64x2

func M256ShuffleF64x2(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256ShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleI32x4

func M256ShuffleI32x4(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleI64x2

func M256ShuffleI64x2(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256SraEpi64

func M256SraEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_sra_epi64'. Requires AVX512F.

func M256SraiEpi64

func M256SraiEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256SravEpi64

func M256SravEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_srav_epi64'. Requires AVX512F.

func M256TernarylogicEpi32

func M256TernarylogicEpi32(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 7
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256TernarylogicEpi64

func M256TernarylogicEpi64(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 3
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256TestEpi32Mask

func M256TestEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm256_test_epi32_mask'. Requires AVX512F.

func M256TestEpi64Mask

func M256TestEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_test_epi64_mask'. Requires AVX512F.

func M256TestnEpi32Mask

func M256TestnEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 7
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_testn_epi32_mask'. Requires AVX512F.

func M256TestnEpi64Mask

func M256TestnEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 3
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_testn_epi64_mask'. Requires AVX512F.
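
The test/testn pairs differ only in the polarity of the per-element check. A pure-Go sketch producing the 8-bit mask (names illustrative):

func testEpi32Mask(a, b [8]int32) (k uint8) {
	for j := range a {
		if a[j]&b[j] != 0 { // VPTESTMD: set bit when the AND is non-zero
			k |= 1 << uint(j)
		}
	}
	return
}

func testnEpi32Mask(a, b [8]int32) (k uint8) {
	for j := range a {
		if a[j]&b[j] == 0 { // VPTESTNMD: set bit when the AND is zero
			k |= 1 << uint(j)
		}
	}
	return
}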

func M512AbsEpi32

func M512AbsEpi32(a x86.M512i) (dst x86.M512i)

M512AbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_abs_epi32'. Requires AVX512F.

func M512AbsEpi64

func M512AbsEpi64(a x86.M512i) (dst x86.M512i)

M512AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_abs_epi64'. Requires AVX512F.
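
As a reference for the ABS pseudocode, a scalar Go model (lane layout illustrative):

	// absEpi64 models VPABSQ over eight 64-bit lanes. As in two's
	// complement hardware, the absolute value of math.MinInt64 wraps
	// back to math.MinInt64.
	func absEpi64(a [8]int64) (dst [8]int64) {
		for j, v := range a {
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
		return dst
	}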

func M512AcosPd

func M512AcosPd(a x86.M512d) (dst x86.M512d)

M512AcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acos_pd'. Requires AVX512F.
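
The element-wise behaviour matches the standard library's math.Acos applied per lane, as in this illustrative model:

	import "math"

	// acosPd models _mm512_acos_pd one float64 lane at a time.
	func acosPd(a [8]float64) (dst [8]float64) {
		for j, v := range a {
			dst[j] = math.Acos(v) // NaN for inputs outside [-1, 1]
		}
		return dst
	}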

func M512AcosPs

func M512AcosPs(a x86.M512) (dst x86.M512)

M512AcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acos_ps'. Requires AVX512F.

func M512AcoshPd

func M512AcoshPd(a x86.M512d) (dst x86.M512d)

M512AcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acosh_pd'. Requires AVX512F.

func M512AcoshPs

func M512AcoshPs(a x86.M512) (dst x86.M512)

M512AcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acosh_ps'. Requires AVX512F.

func M512AddEpi64

func M512AddEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_add_epi64'. Requires AVX512F.

func M512AlignrEpi64

func M512AlignrEpi64(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512AlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst'.

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
dst[511:0] := temp[511:0]
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_alignr_epi64'. Requires AVX512F.
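
A plain-Go sketch of the VALIGNQ shift, assuming 'count' is in [0, 15] as the 1024-bit pseudocode implies (lane layout illustrative):

	// alignrEpi64 models VALIGNQ: a (high) and b (low) are concatenated
	// into 16 lanes, shifted right by count lanes with zero fill, and
	// the low 8 lanes are kept.
	func alignrEpi64(a, b [8]uint64, count int) (dst [8]uint64) {
		var tmp [16]uint64
		copy(tmp[:8], b[:])
		copy(tmp[8:], a[:])
		for j := 0; j < 8; j++ {
			if j+count < 16 {
				dst[j] = tmp[j+count]
			}
		}
		return dst
	}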

func M512AsinPd

func M512AsinPd(a x86.M512d) (dst x86.M512d)

M512AsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asin_pd'. Requires AVX512F.

func M512AsinPs

func M512AsinPs(a x86.M512) (dst x86.M512)

M512AsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asin_ps'. Requires AVX512F.

func M512AsinhPd

func M512AsinhPd(a x86.M512d) (dst x86.M512d)

M512AsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asinh_pd'. Requires AVX512F.

func M512AsinhPs

func M512AsinhPs(a x86.M512) (dst x86.M512)

M512AsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asinh_ps'. Requires AVX512F.

func M512Atan2Pd

func M512Atan2Pd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512Atan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan2_pd'. Requires AVX512F.
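
The ATAN(a/b) pseudocode is a simplification; a quadrant-aware per-lane model in Go would use math.Atan2 (illustrative sketch):

	import "math"

	// atan2Pd models _mm512_atan2_pd lane by lane. math.Atan2 resolves
	// the quadrant from the signs of both arguments, which the plain
	// ATAN(a/b) pseudocode glosses over.
	func atan2Pd(a, b [8]float64) (dst [8]float64) {
		for j := range a {
			dst[j] = math.Atan2(a[j], b[j])
		}
		return dst
	}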

func M512Atan2Ps

func M512Atan2Ps(a x86.M512, b x86.M512) (dst x86.M512)

M512Atan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan2_ps'. Requires AVX512F.

func M512AtanPd

func M512AtanPd(a x86.M512d) (dst x86.M512d)

M512AtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan_pd'. Requires AVX512F.

func M512AtanPs

func M512AtanPs(a x86.M512) (dst x86.M512)

M512AtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan_ps'. Requires AVX512F.

func M512AtanhPd

func M512AtanhPd(a x86.M512d) (dst x86.M512d)

M512AtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atanh_pd'. Requires AVX512F.

func M512AtanhPs

func M512AtanhPs(a x86.M512) (dst x86.M512)

M512AtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atanh_ps'. Requires AVX512F.

func M512BroadcastF32x4

func M512BroadcastF32x4(a x86.M128) (dst x86.M512)

M512BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_broadcast_f32x4'. Requires AVX512F.

func M512BroadcastF64x4

func M512BroadcastF64x4(a x86.M256d) (dst x86.M512d)

M512BroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_broadcast_f64x4'. Requires AVX512F.

func M512BroadcastI32x4

func M512BroadcastI32x4(a x86.M128i) (dst x86.M512i)

M512BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_broadcast_i32x4'. Requires AVX512F.

func M512BroadcastI64x4

func M512BroadcastI64x4(a x86.M256i) (dst x86.M512i)

M512BroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_broadcast_i64x4'. Requires AVX512F.
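
All four x4 broadcasts follow the same pattern; a Go model of this one (lane layout illustrative):

	// broadcastI64x4 models VBROADCASTI64X4: the four source lanes are
	// repeated into both 256-bit halves of the destination.
	func broadcastI64x4(a [4]int64) (dst [8]int64) {
		for j := range dst {
			dst[j] = a[j%4]
		}
		return dst
	}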

func M512BroadcastdEpi32

func M512BroadcastdEpi32(a x86.M128i) (dst x86.M512i)

M512BroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_broadcastd_epi32'. Requires AVX512F.

func M512BroadcastqEpi64

func M512BroadcastqEpi64(a x86.M128i) (dst x86.M512i)

M512BroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_broadcastq_epi64'. Requires AVX512F.

func M512BroadcastsdPd

func M512BroadcastsdPd(a x86.M128d) (dst x86.M512d)

M512BroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_broadcastsd_pd'. Requires AVX512F.

func M512BroadcastssPs

func M512BroadcastssPs(a x86.M128) (dst x86.M512)

M512BroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_broadcastss_ps'. Requires AVX512F.

func M512Castpd128Pd512

func M512Castpd128Pd512(a x86.M128d) (dst x86.M512d)

M512Castpd128Pd512: Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd128_pd512'. Requires AVX512F.

func M512Castpd256Pd512

func M512Castpd256Pd512(a x86.M256d) (dst x86.M512d)

M512Castpd256Pd512: Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd256_pd512'. Requires AVX512F.

func M512Castpd512Pd128

func M512Castpd512Pd128(a x86.M512d) (dst x86.M128d)

M512Castpd512Pd128: Cast vector of type __m512d to type __m128d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd512_pd128'. Requires AVX512F.

func M512Castpd512Pd256

func M512Castpd512Pd256(a x86.M512d) (dst x86.M256d)

M512Castpd512Pd256: Cast vector of type __m512d to type __m256d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd512_pd256'. Requires AVX512F.

func M512Castps128Ps512

func M512Castps128Ps512(a x86.M128) (dst x86.M512)

M512Castps128Ps512: Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps128_ps512'. Requires AVX512F.

func M512Castps256Ps512

func M512Castps256Ps512(a x86.M256) (dst x86.M512)

M512Castps256Ps512: Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps256_ps512'. Requires AVX512F.

func M512Castps512Ps128

func M512Castps512Ps128(a x86.M512) (dst x86.M128)

M512Castps512Ps128: Cast vector of type __m512 to type __m128.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps512_ps128'. Requires AVX512F.

func M512Castps512Ps256

func M512Castps512Ps256(a x86.M512) (dst x86.M256)

M512Castps512Ps256: Cast vector of type __m512 to type __m256.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps512_ps256'. Requires AVX512F.

func M512Castsi128Si512

func M512Castsi128Si512(a x86.M128i) (dst x86.M512i)

M512Castsi128Si512: Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi128_si512'. Requires AVX512F.

func M512Castsi256Si512

func M512Castsi256Si512(a x86.M256i) (dst x86.M512i)

M512Castsi256Si512: Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi256_si512'. Requires AVX512F.

func M512Castsi512Si128

func M512Castsi512Si128(a x86.M512i) (dst x86.M128i)

M512Castsi512Si128: Cast vector of type __m512i to type __m128i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_si128'. Requires AVX512F.

func M512Castsi512Si256

func M512Castsi512Si256(a x86.M512i) (dst x86.M256i)

M512Castsi512Si256: Cast vector of type __m512i to type __m256i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_si256'. Requires AVX512F.
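
All of these casts are reinterpretations, not conversions. A byte-level Go model of the down-cast, assuming a flat byte layout (an illustration, not the package's actual representation):

	// castSi512Si128 models _mm512_castsi512_si128: the low 128 bits
	// are kept as-is and nothing is converted.
	func castSi512Si128(a [64]byte) (dst [16]byte) {
		copy(dst[:], a[:16])
		return dst
	}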

func M512CbrtPd

func M512CbrtPd(a x86.M512d) (dst x86.M512d)

M512CbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cbrt_pd'. Requires AVX512F.

func M512CbrtPs

func M512CbrtPs(a x86.M512) (dst x86.M512)

M512CbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cbrt_ps'. Requires AVX512F.

func M512CdfnormPd

func M512CdfnormPd(a x86.M512d) (dst x86.M512d)

M512CdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorm_pd'. Requires AVX512F.

func M512CdfnormPs

func M512CdfnormPs(a x86.M512) (dst x86.M512)

M512CdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorm_ps'. Requires AVX512F.

func M512CdfnorminvPd

func M512CdfnorminvPd(a x86.M512d) (dst x86.M512d)

M512CdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_pd'. Requires AVX512F.

func M512CdfnorminvPs

func M512CdfnorminvPs(a x86.M512) (dst x86.M512)

M512CdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_ps'. Requires AVX512F.

func M512CeilPd

func M512CeilPd(a x86.M512d) (dst x86.M512d)

M512CeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_ceil_pd'. Requires AVX512F.

func M512CeilPs

func M512CeilPs(a x86.M512) (dst x86.M512)

M512CeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_ceil_ps'. Requires AVX512F.

func M512CmpEpi64Mask

func M512CmpEpi64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
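
A plain-Go model of the eight _MM_CMPINT_* predicates (mask and lane layout illustrative):

	// cmpEpi64Mask models VPCMPQ: 'imm8' selects the predicate, and
	// bit j of the returned mask holds the result for lane j.
	func cmpEpi64Mask(a, b [8]int64, imm8 uint8) uint8 {
		var k uint8
		for j := range a {
			var r bool
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				r = a[j] == b[j]
			case 1: // _MM_CMPINT_LT
				r = a[j] < b[j]
			case 2: // _MM_CMPINT_LE
				r = a[j] <= b[j]
			case 3: // _MM_CMPINT_FALSE
				r = false
			case 4: // _MM_CMPINT_NEQ
				r = a[j] != b[j]
			case 5: // _MM_CMPINT_NLT
				r = a[j] >= b[j]
			case 6: // _MM_CMPINT_NLE
				r = a[j] > b[j]
			case 7: // _MM_CMPINT_TRUE
				r = true
			}
			if r {
				k |= 1 << uint(j)
			}
		}
		return k
	}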

func M512CmpEpu64Mask

func M512CmpEpu64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512CmpeqEpi64Mask

func M512CmpeqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_cmpeq_epi64_mask'. Requires AVX512F.

func M512CmpeqEpu64Mask

func M512CmpeqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpeq_epu64_mask'. Requires AVX512F.

func M512CmpgeEpi64Mask

func M512CmpgeEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpge_epi64_mask'. Requires AVX512F.

func M512CmpgeEpu64Mask

func M512CmpgeEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpge_epu64_mask'. Requires AVX512F.

func M512CmpgtEpi64Mask

func M512CmpgtEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_cmpgt_epi64_mask'. Requires AVX512F.

func M512CmpgtEpu64Mask

func M512CmpgtEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpgt_epu64_mask'. Requires AVX512F.

func M512CmpleEpi64Mask

func M512CmpleEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmple_epi64_mask'. Requires AVX512F.

func M512CmpleEpu64Mask

func M512CmpleEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmple_epu64_mask'. Requires AVX512F.

func M512CmpltEpi32Mask

func M512CmpltEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmplt_epi32_mask'. Requires AVX512F.

func M512CmpltEpi64Mask

func M512CmpltEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmplt_epi64_mask'. Requires AVX512F.

func M512CmpltEpu64Mask

func M512CmpltEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmplt_epu64_mask'. Requires AVX512F.

func M512CmpneqEpi64Mask

func M512CmpneqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpneq_epi64_mask'. Requires AVX512F.

func M512CmpneqEpu64Mask

func M512CmpneqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpneq_epu64_mask'. Requires AVX512F.

func M512CosPd

func M512CosPd(a x86.M512d) (dst x86.M512d)

M512CosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cos_pd'. Requires AVX512F.

func M512CosPs

func M512CosPs(a x86.M512) (dst x86.M512)

M512CosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cos_ps'. Requires AVX512F.

func M512CosdPd

func M512CosdPd(a x86.M512d) (dst x86.M512d)

M512CosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosd_pd'. Requires AVX512F.

func M512CosdPs

func M512CosdPs(a x86.M512) (dst x86.M512)

M512CosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosd_ps'. Requires AVX512F.

func M512CoshPd

func M512CoshPd(a x86.M512d) (dst x86.M512d)

M512CoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosh_pd'. Requires AVX512F.

func M512CoshPs

func M512CoshPs(a x86.M512) (dst x86.M512)

M512CoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosh_ps'. Requires AVX512F.

func M512CvtRoundepi32Ps

func M512CvtRoundepi32Ps(a x86.M512i, rounding int) (dst x86.M512)

M512CvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvt_roundepi32_ps'. Requires AVX512F.

func M512CvtRoundepu32Ps

func M512CvtRoundepu32Ps(a x86.M512i, rounding int) (dst x86.M512)

M512CvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvt_roundepu32_ps'. Requires AVX512F.

func M512CvtRoundpdEpi32

func M512CvtRoundpdEpi32(a x86.M512d, rounding int) (dst x86.M256i)

M512CvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvt_roundpd_epi32'. Requires AVX512F.
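
A Go sketch of how the four basic rounding modes affect this conversion; the integer mode encoding here is illustrative and does not reproduce the _MM_FROUND_* bit values:

	import "math"

	// cvtRoundPdEpi32 models VCVTPD2DQ under the four basic rounding
	// modes. Out-of-range and NaN inputs are not modeled.
	func cvtRoundPdEpi32(a [8]float64, mode int) (dst [8]int32) {
		for j, v := range a {
			var r float64
			switch mode {
			case 0: // _MM_FROUND_TO_NEAREST_INT: round to nearest even
				r = math.RoundToEven(v)
			case 1: // _MM_FROUND_TO_NEG_INF: round down
				r = math.Floor(v)
			case 2: // _MM_FROUND_TO_POS_INF: round up
				r = math.Ceil(v)
			case 3: // _MM_FROUND_TO_ZERO: truncate
				r = math.Trunc(v)
			}
			dst[j] = int32(r)
		}
		return dst
	}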

func M512CvtRoundpdEpu32

func M512CvtRoundpdEpu32(a x86.M512d, rounding int) (dst x86.M256i)

M512CvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvt_roundpd_epu32'. Requires AVX512F.

func M512CvtRoundpdPs

func M512CvtRoundpdPs(a x86.M512d, rounding int) (dst x86.M256)

M512CvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvt_roundpd_ps'. Requires AVX512F.

func M512CvtRoundphPs

func M512CvtRoundphPs(a x86.M256i, sae int) (dst x86.M512)

M512CvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvt_roundph_ps'. Requires AVX512F.

func M512CvtRoundpsEpi32

func M512CvtRoundpsEpi32(a x86.M512, rounding int) (dst x86.M512i)

M512CvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvt_roundps_epi32'. Requires AVX512F.

func M512CvtRoundpsEpu32

func M512CvtRoundpsEpu32(a x86.M512, rounding int) (dst x86.M512i)

M512CvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvt_roundps_epu32'. Requires AVX512F.

func M512CvtRoundpsPd

func M512CvtRoundpsPd(a x86.M256, sae int) (dst x86.M512d)

M512CvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		k := 32*j
		dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvt_roundps_pd'. Requires AVX512F.

func M512CvtRoundpsPh

func M512CvtRoundpsPh(a x86.M512, rounding int) (dst x86.M256i)

M512CvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvt_roundps_ph'. Requires AVX512F.

func M512Cvtepi16Epi32

func M512Cvtepi16Epi32(a x86.M256i) (dst x86.M512i)

M512Cvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_cvtepi16_epi32'. Requires AVX512F.
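
Sign extension maps directly onto Go's integer conversions; an illustrative model:

	// cvtEpi16Epi32 models VPMOVSXWD: Go's int16 to int32 conversion
	// performs exactly this sign extension.
	func cvtEpi16Epi32(a [16]int16) (dst [16]int32) {
		for j, v := range a {
			dst[j] = int32(v)
		}
		return dst
	}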

func M512Cvtepi16Epi64

func M512Cvtepi16Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_cvtepi16_epi64'. Requires AVX512F.

func M512Cvtepi32Epi16

func M512Cvtepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_cvtepi32_epi16'. Requires AVX512F.
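
Truncation likewise maps onto Go's narrowing conversions, which keep the low bits (illustrative model):

	// cvtEpi32Epi16 models VPMOVDW: each lane is narrowed to its low
	// 16 bits, with no saturation.
	func cvtEpi32Epi16(a [16]int32) (dst [16]int16) {
		for j, v := range a {
			dst[j] = int16(v)
		}
		return dst
	}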

func M512Cvtepi32Epi64

func M512Cvtepi32Epi64(a x86.M256i) (dst x86.M512i)

M512Cvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_cvtepi32_epi64'. Requires AVX512F.

func M512Cvtepi32Epi8

func M512Cvtepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_cvtepi32_epi8'. Requires AVX512F.

func M512Cvtepi32Pd

func M512Cvtepi32Pd(a x86.M256i) (dst x86.M512d)

M512Cvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	m := j*64
	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_cvtepi32_pd'. Requires AVX512F.

func M512Cvtepi32Ps

func M512Cvtepi32Ps(a x86.M512i) (dst x86.M512)

M512Cvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvtepi32_ps'. Requires AVX512F.

func M512Cvtepi64Epi16

func M512Cvtepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_cvtepi64_epi16'. Requires AVX512F.

func M512Cvtepi64Epi32

func M512Cvtepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_cvtepi64_epi32'. Requires AVX512F.

func M512Cvtepi64Epi8

func M512Cvtepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_cvtepi64_epi8'. Requires AVX512F.

func M512Cvtepi8Epi32

func M512Cvtepi8Epi32(a x86.M128i) (dst x86.M512i)

M512Cvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_cvtepi8_epi32'. Requires AVX512F.

func M512Cvtepi8Epi64

func M512Cvtepi8Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_cvtepi8_epi64'. Requires AVX512F.

func M512Cvtepu16Epi32

func M512Cvtepu16Epi32(a x86.M256i) (dst x86.M512i)

M512Cvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_cvtepu16_epi32'. Requires AVX512F.

func M512Cvtepu16Epi64

func M512Cvtepu16Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_cvtepu16_epi64'. Requires AVX512F.

func M512Cvtepu32Epi64

func M512Cvtepu32Epi64(a x86.M256i) (dst x86.M512i)

M512Cvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_cvtepu32_epi64'. Requires AVX512F.

func M512Cvtepu32Pd

func M512Cvtepu32Pd(a x86.M256i) (dst x86.M512d)

M512Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := ConvertUnsignedInt32_To_FP64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_cvtepu32_pd'. Requires AVX512F.

func M512Cvtepu32Ps

func M512Cvtepu32Ps(a x86.M512i) (dst x86.M512)

M512Cvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvtepu32_ps'. Requires AVX512F.

func M512Cvtepu8Epi32

func M512Cvtepu8Epi32(a x86.M128i) (dst x86.M512i)

M512Cvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_cvtepu8_epi32'. Requires AVX512F.

func M512Cvtepu8Epi64

func M512Cvtepu8Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_cvtepu8_epi64'. Requires AVX512F.

func M512CvtpdEpi32

func M512CvtpdEpi32(a x86.M512d) (dst x86.M256i)

M512CvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvtpd_epi32'. Requires AVX512F.

func M512CvtpdEpu32

func M512CvtpdEpu32(a x86.M512d) (dst x86.M256i)

M512CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvtpd_epu32'. Requires AVX512F.

func M512CvtpdPs

func M512CvtpdPs(a x86.M512d) (dst x86.M256)

M512CvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvtpd_ps'. Requires AVX512F.

func M512CvtphPs

func M512CvtphPs(a x86.M256i) (dst x86.M512)

M512CvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	m := j*16
	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvtph_ps'. Requires AVX512F.

func M512CvtpsEpi32

func M512CvtpsEpi32(a x86.M512) (dst x86.M512i)

M512CvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvtps_epi32'. Requires AVX512F.

func M512CvtpsEpu32

func M512CvtpsEpu32(a x86.M512) (dst x86.M512i)

M512CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvtps_epu32'. Requires AVX512F.

func M512CvtpsPd

func M512CvtpsPd(a x86.M256) (dst x86.M512d)

M512CvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvtps_pd'. Requires AVX512F.

func M512CvtpsPh

func M512CvtpsPh(a x86.M512, rounding int) (dst x86.M256i)

M512CvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvtps_ph'. Requires AVX512F.

func M512Cvtsepi32Epi16

func M512Cvtsepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_cvtsepi32_epi16'. Requires AVX512F.
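
Unlike the truncating VPMOVDW above, this down-conversion clamps first; a Go model using the standard math constants (lane layout illustrative):

	import "math"

	// cvtsEpi32Epi16 models VPMOVSDW: lanes are clamped to the int16
	// range before narrowing.
	func cvtsEpi32Epi16(a [16]int32) (dst [16]int16) {
		for j, v := range a {
			switch {
			case v > math.MaxInt16:
				v = math.MaxInt16
			case v < math.MinInt16:
				v = math.MinInt16
			}
			dst[j] = int16(v)
		}
		return dst
	}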

func M512Cvtsepi32Epi8

func M512Cvtsepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_cvtsepi32_epi8'. Requires AVX512F.

func M512Cvtsepi64Epi16

func M512Cvtsepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_cvtsepi64_epi16'. Requires AVX512F.

func M512Cvtsepi64Epi32

func M512Cvtsepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_cvtsepi64_epi32'. Requires AVX512F.

func M512Cvtsepi64Epi8

func M512Cvtsepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_cvtsepi64_epi8'. Requires AVX512F.

func M512CvttRoundpdEpi32

func M512CvttRoundpdEpi32(a x86.M512d, sae int) (dst x86.M256i)

M512CvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		k := 64*j
		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k])
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvtt_roundpd_epi32'. Requires AVX512F.

func M512CvttRoundpdEpu32

func M512CvttRoundpdEpu32(a x86.M512d, sae int) (dst x86.M256i)

M512CvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		k := 64*j
		dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k])
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvtt_roundpd_epu32'. Requires AVX512F.

func M512CvttRoundpsEpi32

func M512CvttRoundpsEpi32(a x86.M512, sae int) (dst x86.M512i)

M512CvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvtt_roundps_epi32'. Requires AVX512F.

func M512CvttRoundpsEpu32

func M512CvttRoundpsEpu32(a x86.M512, sae int) (dst x86.M512i)

M512CvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvtt_roundps_epu32'. Requires AVX512F.

func M512CvttpdEpi32

func M512CvttpdEpi32(a x86.M512d) (dst x86.M256i)

M512CvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvttpd_epi32'. Requires AVX512F.

func M512CvttpdEpu32

func M512CvttpdEpu32(a x86.M512d) (dst x86.M256i)

M512CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvttpd_epu32'. Requires AVX512F.

func M512CvttpsEpi32

func M512CvttpsEpi32(a x86.M512) (dst x86.M512i)

M512CvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvttps_epi32'. Requires AVX512F.
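
The 'tt' variants always truncate toward zero; Go's float-to-int conversion does the same, as in this illustrative model (out-of-range inputs are not handled):

	// cvttPsEpi32 models VCVTTPS2DQ: conversion truncates toward zero
	// regardless of the current rounding mode.
	func cvttPsEpi32(a [16]float32) (dst [16]int32) {
		for j, v := range a {
			dst[j] = int32(v)
		}
		return dst
	}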

func M512CvttpsEpu32

func M512CvttpsEpu32(a x86.M512) (dst x86.M512i)

M512CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvttps_epu32'. Requires AVX512F.

func M512Cvtusepi32Epi16

func M512Cvtusepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_cvtusepi32_epi16'. Requires AVX512F.
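
A Go model of the unsigned clamp (lane layout illustrative):

	// cvtusEpi32Epi16 models VPMOVUSDW: lanes above 0xFFFF saturate to
	// 0xFFFF before narrowing.
	func cvtusEpi32Epi16(a [16]uint32) (dst [16]uint16) {
		for j, v := range a {
			if v > 0xFFFF {
				v = 0xFFFF
			}
			dst[j] = uint16(v)
		}
		return dst
	}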

func M512Cvtusepi32Epi8

func M512Cvtusepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_cvtusepi32_epi8'. Requires AVX512F.

func M512Cvtusepi64Epi16

func M512Cvtusepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_cvtusepi64_epi16'. Requires AVX512F.

func M512Cvtusepi64Epi32

func M512Cvtusepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_cvtusepi64_epi32'. Requires AVX512F.

func M512Cvtusepi64Epi8

func M512Cvtusepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_cvtusepi64_epi8'. Requires AVX512F.

func M512DivEpi16

func M512DivEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi16'. Requires AVX512F.

func M512DivEpi32

func M512DivEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi32'. Requires AVX512F.
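
There is no single instruction behind these SVML-style division helpers; element-wise Go division models the truncating semantics (lane layout illustrative):

	// divEpi32 models _mm512_div_epi32: per-lane truncating division.
	// A zero lane in b panics in Go rather than faulting.
	func divEpi32(a, b [16]int32) (dst [16]int32) {
		for j := range a {
			dst[j] = a[j] / b[j]
		}
		return dst
	}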

func M512DivEpi64

func M512DivEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi64'. Requires AVX512F.

func M512DivEpi8

func M512DivEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi8'. Requires AVX512F.

func M512DivEpu16

func M512DivEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu16'. Requires AVX512F.

func M512DivEpu32

func M512DivEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu32'. Requires AVX512F.

func M512DivEpu64

func M512DivEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu64'. Requires AVX512F.

func M512DivEpu8

func M512DivEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu8'. Requires AVX512F.
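
The eight _mm512_div_ep* forms above are emulated sequences rather than single instructions (hence Instruction: '...'). Per lane they reduce to Go's native integer division, which already truncates toward zero; a sketch:

// divTruncLane models one signed lane: Go's integer '/' truncates
// toward zero, matching TRUNCATE in the pseudocode. Division by
// zero panics, the scalar analogue of a hardware #DE fault.
func divTruncLane(a, b int32) int32 {
	return a / b
}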

func M512DivPd

func M512DivPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512DivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_pd'. Requires AVX512F.

func M512DivPs

func M512DivPs(a x86.M512, b x86.M512) (dst x86.M512)

M512DivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_ps'. Requires AVX512F.

func M512DivRoundPd

func M512DivRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512DivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			dst[i+63:i] := a[i+63:i] / b[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_round_pd'. Requires AVX512F.

func M512DivRoundPs

func M512DivRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512DivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := a[i+31:i] / b[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_round_ps'. Requires AVX512F.
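
The 'rounding' argument is a plain int built by OR-ing the C-style _MM_FROUND_* values. A sketch using the conventional encodings from <xmmintrin.h> (declared by hand here, since this demonstration package may not export them):

// Assumed _MM_FROUND_* encodings: the low two bits select the
// rounding mode, 0x04 means "use MXCSR.RC", and bit 3 suppresses
// exceptions.
const (
	froundToNearestInt = 0x00
	froundToNegInf     = 0x01
	froundToPosInf     = 0x02
	froundToZero       = 0x03
	froundCurDirection = 0x04
	froundNoExc        = 0x08
)

// For example, truncate and suppress exceptions:
var roundingTruncate = froundToZero | froundNoExc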

func M512ErfPd

func M512ErfPd(a x86.M512d) (dst x86.M512d)

M512ErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erf_pd'. Requires AVX512F.

func M512ErfPs

func M512ErfPs(a x86.M512) (dst x86.M512)

M512ErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erf_ps'. Requires AVX512F.

func M512ErfcPd

func M512ErfcPd(a x86.M512d) (dst x86.M512d)

M512ErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfc_pd'. Requires AVX512F.

func M512ErfcPs

func M512ErfcPs(a x86.M512) (dst x86.M512)

M512ErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfc_ps'. Requires AVX512F.

func M512ErfcinvPd

func M512ErfcinvPd(a x86.M512d) (dst x86.M512d)

M512ErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvERFC(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfcinv_pd'. Requires AVX512F.

func M512ErfcinvPs

func M512ErfcinvPs(a x86.M512) (dst x86.M512)

M512ErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvERFC(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfcinv_ps'. Requires AVX512F.

func M512ErfinvPd

func M512ErfinvPd(a x86.M512d) (dst x86.M512d)

M512ErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfinv_pd'. Requires AVX512F.

func M512ErfinvPs

func M512ErfinvPs(a x86.M512) (dst x86.M512)

M512ErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfinv_ps'. Requires AVX512F.

func M512Exp10Pd

func M512Exp10Pd(a x86.M512d) (dst x86.M512d)

M512Exp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp10_pd'. Requires AVX512F.

func M512Exp10Ps

func M512Exp10Ps(a x86.M512) (dst x86.M512)

M512Exp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp10_ps'. Requires AVX512F.

func M512Exp2Pd

func M512Exp2Pd(a x86.M512d) (dst x86.M512d)

M512Exp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp2_pd'. Requires AVX512F.

func M512Exp2Ps

func M512Exp2Ps(a x86.M512) (dst x86.M512)

M512Exp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp2_ps'. Requires AVX512F.

func M512ExpPd

func M512ExpPd(a x86.M512d) (dst x86.M512d)

M512ExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp_pd'. Requires AVX512F.

func M512ExpPs

func M512ExpPs(a x86.M512) (dst x86.M512)

M512ExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp_ps'. Requires AVX512F.

func M512Expm1Pd

func M512Expm1Pd(a x86.M512d) (dst x86.M512d)

M512Expm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_expm1_pd'. Requires AVX512F.

func M512Expm1Ps

func M512Expm1Ps(a x86.M512) (dst x86.M512)

M512Expm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_expm1_ps'. Requires AVX512F.
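
Per lane this is the standard expm1 function; Go's math.Expm1 exists precisely because computing e^x - 1 naively loses all precision for tiny x. A scalar reference (assumes import "math"):

// expm1Lane keeps full precision near zero: math.Expm1(1e-20) is
// about 1e-20, whereas math.Exp(1e-20)-1 rounds to exactly 0.
func expm1Lane(x float64) float64 {
	return math.Expm1(x)
}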

func M512Extractf32x4Ps

func M512Extractf32x4Ps(a x86.M512, imm8 byte) (dst x86.M128)

M512Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
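
The CASE table above is plain index arithmetic; a scalar model treating the 512-bit register as [16]float32 (a sketch, not this package's API):

// extractLane128 returns 128-bit lane n of a 512-bit vector:
// lane n covers elements 4n..4n+3, and only imm8[1:0] participate.
func extractLane128(a [16]float32, imm8 byte) [4]float32 {
	n := int(imm8 & 3)
	var dst [4]float32
	copy(dst[:], a[4*n:4*n+4])
	return dst
}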

func M512Extractf64x4Pd

func M512Extractf64x4Pd(a x86.M512d, imm8 byte) (dst x86.M256d)

M512Extractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Extracti32x4Epi32

func M512Extracti32x4Epi32(a x86.M512i, imm8 byte) (dst x86.M128i)

M512Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Extracti64x4Epi64

func M512Extracti64x4Epi64(a x86.M512i, imm8 byte) (dst x86.M256i)

M512Extracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmPd

func M512FixupimmPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
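
The least obvious step above is token_response: 'c' acts as a per-lane table of sixteen 4-bit responses indexed by the token class j. A sketch of that nibble extraction:

// tokenResponse pulls the 4-bit response for token class j out of
// a 64-bit table lane, i.e. src3[3+4*j:4*j] in the pseudocode.
func tokenResponse(table uint64, j uint) uint8 {
	return uint8(table>>(4*j)) & 0xF
}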

func M512FixupimmPs

func M512FixupimmPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmRoundPd

func M512FixupimmRoundPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512FixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmRoundPs

func M512FixupimmRoundPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512FixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FloorPd

func M512FloorPd(a x86.M512d) (dst x86.M512d)

M512FloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_floor_pd'. Requires AVX512F.

func M512FloorPs

func M512FloorPs(a x86.M512) (dst x86.M512)

M512FloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_floor_ps'. Requires AVX512F.

func M512FmaddsubPd

func M512FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_pd'. Requires AVX512F.

func M512FmaddsubPs

func M512FmaddsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_ps'. Requires AVX512F.
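
The even/odd alternation is the entire trick; a Go model of the unmasked operation over plain slices (a sketch, not this package's vector types). The fmsubadd variants below are the same loop with the two signs swapped:

// fmaddsub: even lanes compute a*b - c, odd lanes a*b + c.
func fmaddsub(a, b, c []float32) []float32 {
	dst := make([]float32, len(a))
	for j := range dst {
		if j%2 == 0 {
			dst[j] = a[j]*b[j] - c[j]
		} else {
			dst[j] = a[j]*b[j] + c[j]
		}
	}
	return dst
}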

func M512FmaddsubRoundPd

func M512FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF (j is even)
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_round_pd'. Requires AVX512F.

func M512FmaddsubRoundPs

func M512FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF (j is even)
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_round_ps'. Requires AVX512F.

func M512FmsubaddPd

func M512FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_pd'. Requires AVX512F.

func M512FmsubaddPs

func M512FmsubaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_ps'. Requires AVX512F.

func M512FmsubaddRoundPd

func M512FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF (j is even)
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_round_pd'. Requires AVX512F.

func M512FmsubaddRoundPs

func M512FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF (j is even)
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_round_ps'. Requires AVX512F.

func M512HypotPd

func M512HypotPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512HypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_hypot_pd'. Requires AVX512F.

func M512HypotPs

func M512HypotPs(a x86.M512, b x86.M512) (dst x86.M512)

M512HypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_hypot_ps'. Requires AVX512F.
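
Per lane this is the classic hypot; Go's math.Hypot is the scalar reference, and unlike a literal SQRT(a^2 + b^2) it avoids spurious overflow and underflow when squaring:

// hypotLane (assumes import "math"): math.Hypot(1e200, 1e200) is
// finite, while squaring the operands first would overflow.
func hypotLane(a, b float64) float64 {
	return math.Hypot(a, b)
}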

func M512Insertf32x4

func M512Insertf32x4(a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Insertf64x4

func M512Insertf64x4(a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512Insertf64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[0]) of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Inserti32x4

func M512Inserti32x4(a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Inserti64x4

func M512Inserti64x4(a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512Inserti64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[0]) of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512InvsqrtPd

func M512InvsqrtPd(a x86.M512d) (dst x86.M512d)

M512InvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_invsqrt_pd'. Requires AVX512F.

func M512InvsqrtPs

func M512InvsqrtPs(a x86.M512) (dst x86.M512)

M512InvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_invsqrt_ps'. Requires AVX512F.

func M512Kand

func M512Kand(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kand: Compute the bitwise AND of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] AND b[15:0]
k[MAX:16] := 0

Instruction: 'KANDW'. Intrinsic: '_mm512_kand'. Requires AVX512F.

func M512Kandn

func M512Kandn(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kandn: Compute the bitwise AND NOT of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := (NOT a[15:0]) AND b[15:0]
k[MAX:16] := 0

Instruction: 'KANDNW'. Intrinsic: '_mm512_kandn'. Requires AVX512F.

func M512Kmov

func M512Kmov(a x86.Mmask16) (dst x86.Mmask16)

M512Kmov: Copy 16-bit mask 'a' to 'k'.

k[15:0] := a[15:0]
k[MAX:16] := 0

Instruction: 'KMOVW'. Intrinsic: '_mm512_kmov'. Requires AVX512F.

func M512Knot

func M512Knot(a x86.Mmask16) (dst x86.Mmask16)

M512Knot: Compute the bitwise NOT of 16-bit mask 'a', and store the result in 'k'.

k[15:0] := NOT a[15:0]
k[MAX:16] := 0

Instruction: 'KNOTW'. Intrinsic: '_mm512_knot'. Requires AVX512F.

func M512Kor

func M512Kor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kor: Compute the bitwise OR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] OR b[15:0]
k[MAX:16] := 0

Instruction: 'KORW'. Intrinsic: '_mm512_kor'. Requires AVX512F.

func M512Kortestc

func M512Kortestc(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestc: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. CF flag is set if 'dst' consists of all 1's.

dst[15:0] := k1[15:0] | k2[15:0]
IF PopCount(dst[15:0]) = 16
	SetCF()
FI

Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestc'. Requires AVX512F.

func M512Kortestz

func M512Kortestz(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestz: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. ZF flag is set if 'dst' is 0.

dst[15:0] := k1[15:0] | k2[15:0]
IF dst = 0
	SetZF()
FI

Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestz'. Requires AVX512F.
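
Both KORTESTW predicates are easy to model on a plain uint16 (a sketch; assumes import "math/bits"):

// kortestc reports CF: set when the OR of the masks is all ones.
func kortestc(k1, k2 uint16) bool {
	return bits.OnesCount16(k1|k2) == 16
}

// kortestz reports ZF: set when the OR of the masks is zero.
func kortestz(k1, k2 uint16) bool {
	return k1|k2 == 0
}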

func M512Kunpackb

func M512Kunpackb(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kunpackb: Unpack and interleave 8 bits from masks 'a' and 'b', and store the 16-bit result in 'k'.

k[7:0] := b[7:0]
k[15:8] := a[7:0]
k[MAX:16] := 0

Instruction: 'KUNPCKBW'. Intrinsic: '_mm512_kunpackb'. Requires AVX512F.

func M512Kxnor

func M512Kxnor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxnor: Compute the bitwise XNOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := NOT (a[15:0] XOR b[15:0])
k[MAX:16] := 0

Instruction: 'KXNORW'. Intrinsic: '_mm512_kxnor'. Requires AVX512F.

func M512Kxor

func M512Kxor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxor: Compute the bitwise XOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] XOR b[15:0]
k[MAX:16] := 0

Instruction: 'KXORW'. Intrinsic: '_mm512_kxor'. Requires AVX512F.
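
The whole 16-bit mask family maps one-to-one onto uint16 bit operations; a compact sketch of the ops above:

func kand(a, b uint16) uint16     { return a & b }
func kandn(a, b uint16) uint16    { return ^a & b }
func knot(a uint16) uint16        { return ^a }
func kor(a, b uint16) uint16      { return a | b }
func kxor(a, b uint16) uint16     { return a ^ b }
func kxnor(a, b uint16) uint16    { return ^(a ^ b) }
func kunpackb(a, b uint16) uint16 { return a<<8 | b&0xFF }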

func M512Log10Pd

func M512Log10Pd(a x86.M512d) (dst x86.M512d)

M512Log10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log10_pd'. Requires AVX512F.

func M512Log10Ps

func M512Log10Ps(a x86.M512) (dst x86.M512)

M512Log10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log10_ps'. Requires AVX512F.

func M512Log1pPd

func M512Log1pPd(a x86.M512d) (dst x86.M512d)

M512Log1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log1p_pd'. Requires AVX512F.

func M512Log1pPs

func M512Log1pPs(a x86.M512) (dst x86.M512)

M512Log1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log1p_ps'. Requires AVX512F.

func M512Log2Pd

func M512Log2Pd(a x86.M512d) (dst x86.M512d)

M512Log2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log2_pd'. Requires AVX512F.

func M512LogPd

func M512LogPd(a x86.M512d) (dst x86.M512d)

M512LogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log_pd'. Requires AVX512F.

func M512LogPs

func M512LogPs(a x86.M512) (dst x86.M512)

M512LogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log_ps'. Requires AVX512F.

func M512LogbPd

func M512LogbPd(a x86.M512d) (dst x86.M512d)

M512LogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_logb_pd'. Requires AVX512F.

func M512LogbPs

func M512LogbPs(a x86.M512) (dst x86.M512)

M512LogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_logb_ps'. Requires AVX512F.
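
ConvertExpFP* is the unbiased binary exponent; the scalar reference is math.Logb (a sketch, assumes import "math"):

// logbLane returns the exponent of x as a float, e.g.
// math.Logb(10) == 3, matching floor(log2(10)).
func logbLane(x float64) float64 {
	return math.Logb(x)
}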

func M512Mask2Permutex2varEpi32

func M512Mask2Permutex2varEpi32(a x86.M512i, idx x86.M512i, k x86.Mmask16, b x86.M512i) (dst x86.M512i)

M512Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm512_mask2_permutex2var_epi32'. Requires AVX512F.

func M512Mask2Permutex2varEpi64

func M512Mask2Permutex2varEpi64(a x86.M512i, idx x86.M512i, k x86.Mmask8, b x86.M512i) (dst x86.M512i)

M512Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm512_mask2_permutex2var_epi64'. Requires AVX512F.

func M512Mask2Permutex2varPd

func M512Mask2Permutex2varPd(a x86.M512d, idx x86.M512i, k x86.Mmask8, b x86.M512d) (dst x86.M512d)

M512Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm512_mask2_permutex2var_pd'. Requires AVX512F.

func M512Mask2Permutex2varPs

func M512Mask2Permutex2varPs(a x86.M512, idx x86.M512i, k x86.Mmask16, b x86.M512) (dst x86.M512)

M512Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm512_mask2_permutex2var_ps'. Requires AVX512F.
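
In the 32-bit forms each idx lane carries five significant bits: the low four select a lane, bit 4 selects the source. A scalar model of one lane (a sketch over arrays, not this package's types):

// permute2Lane32: idx bits [3:0] pick the lane, bit 4 picks
// between the two 16-lane sources a and b.
func permute2Lane32(a, b [16]uint32, idx uint32) uint32 {
	off := idx & 0xF
	if idx&0x10 != 0 {
		return b[off]
	}
	return a[off]
}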

func M512Mask3FmaddsubPd

func M512Mask3FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_pd'. Requires AVX512F.

func M512Mask3FmaddsubPs

func M512Mask3FmaddsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_ps'. Requires AVX512F.

func M512Mask3FmaddsubRoundPd

func M512Mask3FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_round_pd'. Requires AVX512F.

func M512Mask3FmaddsubRoundPs

func M512Mask3FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_round_ps'. Requires AVX512F.

func M512Mask3FmsubaddPd

func M512Mask3FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_pd'. Requires AVX512F.

func M512Mask3FmsubaddPs

func M512Mask3FmsubaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_ps'. Requires AVX512F.

func M512Mask3FmsubaddRoundPd

func M512Mask3FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_round_pd'. Requires AVX512F.

func M512Mask3FmsubaddRoundPs

func M512Mask3FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskAbsEpi32

func M512MaskAbsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_mask_abs_epi32'. Requires AVX512F.
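
The writemask convention here is shared by every _mm512_mask_* intrinsic that follows: lane j takes the operation's result when bit j of 'k' is set, and src's lane otherwise. A scalar model (a sketch; the real mask type is x86.Mmask16):

// maskedAbs32: per-lane absolute value under a 16-bit writemask.
func maskedAbs32(src [16]int32, k uint16, a [16]int32) [16]int32 {
	dst := src
	for j := 0; j < 16; j++ {
		if k&(1<<j) != 0 {
			v := a[j]
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
	}
	return dst
}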

func M512MaskAbsEpi64

func M512MaskAbsEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_mask_abs_epi64'. Requires AVX512F.

func M512MaskAcosPd

func M512MaskAcosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ACOS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acos_pd'. Requires AVX512F.

func M512MaskAcosPs

func M512MaskAcosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ACOS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acos_ps'. Requires AVX512F.

func M512MaskAcoshPd

func M512MaskAcoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ACOSH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acosh_pd'. Requires AVX512F.

func M512MaskAcoshPs

func M512MaskAcoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ACOSH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acosh_ps'. Requires AVX512F.

func M512MaskAddEpi64

func M512MaskAddEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_mask_add_epi64'. Requires AVX512F.

func M512MaskAlignrEpi64

func M512MaskAlignrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := temp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_mask_alignr_epi64'. Requires AVX512F.
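
Ignoring the writemask, VALIGNQ is a lane-granular funnel shift; a scalar model treating each 512-bit operand as [8]uint64 (a sketch; count is assumed to be in 0..7):

// alignrQ concatenates a:b (a in the high lanes), shifts right by
// 'count' 64-bit lanes, and keeps the low 8 lanes.
func alignrQ(a, b [8]uint64, count uint) [8]uint64 {
	var tmp [16]uint64
	copy(tmp[0:8], b[:])
	copy(tmp[8:16], a[:])
	var dst [8]uint64
	copy(dst[:], tmp[count:count+8])
	return dst
}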

func M512MaskAsinPd

func M512MaskAsinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ASIN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asin_pd'. Requires AVX512F.

func M512MaskAsinPs

func M512MaskAsinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ASIN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asin_ps'. Requires AVX512F.

func M512MaskAsinhPd

func M512MaskAsinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ASINH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asinh_pd'. Requires AVX512F.

func M512MaskAsinhPs

func M512MaskAsinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ASINH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asinh_ps'. Requires AVX512F.

func M512MaskAtan2Pd

func M512MaskAtan2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAtan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan2_pd'. Requires AVX512F.

func M512MaskAtan2Ps

func M512MaskAtan2Ps(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAtan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan2_ps'. Requires AVX512F.

func M512MaskAtanPd

func M512MaskAtanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATAN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan_pd'. Requires AVX512F.

func M512MaskAtanPs

func M512MaskAtanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATAN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan_ps'. Requires AVX512F.

func M512MaskAtanhPd

func M512MaskAtanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATANH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atanh_pd'. Requires AVX512F.

func M512MaskAtanhPs

func M512MaskAtanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATANH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atanh_ps'. Requires AVX512F.

func M512MaskBroadcastF32x4

func M512MaskBroadcastF32x4(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_mask_broadcast_f32x4'. Requires AVX512F.
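
The j mod 4 indexing above repeats the 128-bit source across all four 128-bit groups of the destination. A scalar Go sketch (hypothetical helper, not the package API):

	// maskBroadcastF32x4 models _mm512_mask_broadcast_f32x4: lane j takes
	// a[j mod 4] when its mask bit is set, else src[j].
	func maskBroadcastF32x4(src [16]float32, k uint16, a [4]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j%4]
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}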

func M512MaskBroadcastF64x4

func M512MaskBroadcastF64x4(src x86.M512d, k x86.Mmask8, a x86.M256d) (dst x86.M512d)

M512MaskBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_mask_broadcast_f64x4'. Requires AVX512F.

func M512MaskBroadcastI32x4

func M512MaskBroadcastI32x4(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_mask_broadcast_i32x4'. Requires AVX512F.

func M512MaskBroadcastI64x4

func M512MaskBroadcastI64x4(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_mask_broadcast_i64x4'. Requires AVX512F.

func M512MaskBroadcastdEpi32

func M512MaskBroadcastdEpi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_broadcastd_epi32'. Requires AVX512F.

func M512MaskBroadcastqEpi64

func M512MaskBroadcastqEpi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_broadcastq_epi64'. Requires AVX512F.

func M512MaskBroadcastsdPd

func M512MaskBroadcastsdPd(src x86.M512d, k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_mask_broadcastsd_pd'. Requires AVX512F.

func M512MaskBroadcastssPs

func M512MaskBroadcastssPs(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_mask_broadcastss_ps'. Requires AVX512F.

func M512MaskCbrtPd

func M512MaskCbrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CubeRoot(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_pd'. Requires AVX512F.

func M512MaskCbrtPs

func M512MaskCbrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CubeRoot(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_ps'. Requires AVX512F.

func M512MaskCdfnormPd

func M512MaskCdfnormPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CDFNormal(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_pd'. Requires AVX512F.
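
For reference, the CDFNormal operation in the pseudocode is the standard normal CDF, which can be written with math.Erf; this is the mathematical definition, not the SVML implementation (assumes import "math"):

	// cdfNormal evaluates one lane of CDFNormal using the identity
	// Φ(x) = (1 + erf(x/√2)) / 2.
	func cdfNormal(x float64) float64 {
		return 0.5 * (1 + math.Erf(x/math.Sqrt2))
	}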

func M512MaskCdfnormPs

func M512MaskCdfnormPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CDFNormal(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_ps'. Requires AVX512F.

func M512MaskCdfnorminvPd

func M512MaskCdfnorminvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InverseCDFNormal(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_pd'. Requires AVX512F.

func M512MaskCdfnorminvPs

func M512MaskCdfnorminvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InverseCDFNormal(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_ps'. Requires AVX512F.

func M512MaskCeilPd

func M512MaskCeilPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CEIL(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_ceil_pd'. Requires AVX512F.

func M512MaskCeilPs

func M512MaskCeilPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CEIL(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_ceil_ps'. Requires AVX512F.

func M512MaskCmpEpi64Mask

func M512MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
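
A scalar Go sketch of the predicate dispatch above (hypothetical helper; the _MM_CMPINT_* names appear only in comments since the package does not export them):

	// maskCmpEpi64 models _mm512_mask_cmp_epi64_mask: imm8 selects one of
	// the eight predicates, and k1 zeroes lanes whose mask bit is clear.
	func maskCmpEpi64(k1 uint8, a, b [8]int64, imm8 byte) (k uint8) {
		cmp := func(x, y int64) bool {
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				return x == y
			case 1: // _MM_CMPINT_LT
				return x < y
			case 2: // _MM_CMPINT_LE
				return x <= y
			case 3: // _MM_CMPINT_FALSE
				return false
			case 4: // _MM_CMPINT_NEQ
				return x != y
			case 5: // _MM_CMPINT_NLT
				return x >= y
			case 6: // _MM_CMPINT_NLE
				return x > y
			default: // _MM_CMPINT_TRUE
				return true
			}
		}
		for j := 0; j < 8; j++ {
			if k1&(1<<uint(j)) != 0 && cmp(a[j], b[j]) {
				k |= 1 << uint(j)
			}
		}
		return k
	}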

func M512MaskCmpEpu64Mask

func M512MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpeqEpi64Mask

func M512MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_mask_cmpeq_epi64_mask'. Requires AVX512F.

func M512MaskCmpeqEpu64Mask

func M512MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpeq_epu64_mask'. Requires AVX512F.

func M512MaskCmpgeEpi64Mask

func M512MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpge_epi64_mask'. Requires AVX512F.

func M512MaskCmpgeEpu64Mask

func M512MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpge_epu64_mask'. Requires AVX512F.

func M512MaskCmpgtEpi64Mask

func M512MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_mask_cmpgt_epi64_mask'. Requires AVX512F.

func M512MaskCmpgtEpu64Mask

func M512MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpgt_epu64_mask'. Requires AVX512F.

func M512MaskCmpleEpi64Mask

func M512MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmple_epi64_mask'. Requires AVX512F.

func M512MaskCmpleEpu64Mask

func M512MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmple_epu64_mask'. Requires AVX512F.

func M512MaskCmpltEpi32Mask

func M512MaskCmpltEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmplt_epi32_mask'. Requires AVX512F.

func M512MaskCmpltEpi64Mask

func M512MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmplt_epi64_mask'. Requires AVX512F.

func M512MaskCmpltEpu64Mask

func M512MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmplt_epu64_mask'. Requires AVX512F.

func M512MaskCmpneqEpi64Mask

func M512MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpneq_epi64_mask'. Requires AVX512F.

func M512MaskCmpneqEpu64Mask

func M512MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpneq_epu64_mask'. Requires AVX512F.

func M512MaskCompressEpi32

func M512MaskCompressEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_mask_compress_epi32'. Requires AVX512F.
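
A scalar Go sketch of the compress pattern above (hypothetical helper, not the package API):

	// maskCompressEpi32 models _mm512_mask_compress_epi32: active lanes of
	// a are packed contiguously into the low lanes of dst, and the
	// remaining high lanes are passed through from src.
	func maskCompressEpi32(src [16]int32, k uint16, a [16]int32) (dst [16]int32) {
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		copy(dst[m:], src[m:])
		return dst
	}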

func M512MaskCompressEpi64

func M512MaskCompressEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_mask_compress_epi64'. Requires AVX512F.

func M512MaskCompressPd

func M512MaskCompressPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_mask_compress_pd'. Requires AVX512F.

func M512MaskCompressPs

func M512MaskCompressPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_mask_compress_ps'. Requires AVX512F.

func M512MaskCosPd

func M512MaskCosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cos_pd'. Requires AVX512F.
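
Since the SVML cosine maps to no single instruction, a lane-by-lane Go model with math.Cos conveys the semantics; the accuracy and special-case behavior of the real SVML sequence may differ (assumes import "math"):

	// maskCosPd models _mm512_mask_cos_pd one lane at a time.
	func maskCosPd(src [8]float64, k uint8, a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Cos(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}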

func M512MaskCosPs

func M512MaskCosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cos_ps'. Requires AVX512F.

func M512MaskCosdPd

func M512MaskCosdPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COSD(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosd_pd'. Requires AVX512F.

func M512MaskCosdPs

func M512MaskCosdPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COSD(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosd_ps'. Requires AVX512F.

func M512MaskCoshPd

func M512MaskCoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COSH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosh_pd'. Requires AVX512F.

func M512MaskCoshPs

func M512MaskCoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COSH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosh_ps'. Requires AVX512F.

func M512MaskCvtRoundepi32Ps

func M512MaskCvtRoundepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepi32_ps'. Requires AVX512F.

func M512MaskCvtRoundepu32Ps

func M512MaskCvtRoundepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepu32_ps'. Requires AVX512F.

func M512MaskCvtRoundpdEpi32

func M512MaskCvtRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epi32'. Requires AVX512F.
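
The four explicit rounding modes map naturally onto Go's math package. A sketch for in-range inputs, using local stand-in constants rather than the _MM_FROUND_* values; hardware overflow behavior is not modelled (assumes import "math"):

	// roundMode is a local stand-in for the _MM_FROUND_* selector.
	type roundMode int

	const (
		toNearest roundMode = iota // round to nearest, ties to even
		toNegInf                   // round down
		toPosInf                   // round up
		toZero                     // truncate
	)

	// cvtRoundPdEpi32 models _mm512_mask_cvt_roundpd_epi32.
	func cvtRoundPdEpi32(src [8]int32, k uint8, a [8]float64, mode roundMode) (dst [8]int32) {
		round := [...]func(float64) float64{
			toNearest: math.RoundToEven,
			toNegInf:  math.Floor,
			toPosInf:  math.Ceil,
			toZero:    math.Trunc,
		}[mode]
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(round(a[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}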

func M512MaskCvtRoundpdEpu32

func M512MaskCvtRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epu32'. Requires AVX512F.

func M512MaskCvtRoundpdPs

func M512MaskCvtRoundpdPs(src x86.M256, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)

M512MaskCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvt_roundpd_ps'. Requires AVX512F.

func M512MaskCvtRoundphPs

func M512MaskCvtRoundphPs(src x86.M512, k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)

M512MaskCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		IF k[j]
			dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvt_roundph_ps'. Requires AVX512F.

func M512MaskCvtRoundpsEpi32

func M512MaskCvtRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvt_roundps_epi32'. Requires AVX512F.

func M512MaskCvtRoundpsEpu32

func M512MaskCvtRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvt_roundps_epu32'. Requires AVX512F.

func M512MaskCvtRoundpsPd

func M512MaskCvtRoundpsPd(src x86.M512d, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)

M512MaskCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		l := 32*j
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvt_roundps_pd'. Requires AVX512F.

func M512MaskCvtRoundpsPh

func M512MaskCvtRoundpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvt_roundps_ph'. Requires AVX512F.

func M512MaskCvtepi16Epi32

func M512MaskCvtepi16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_mask_cvtepi16_epi32'. Requires AVX512F.
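
In plain Go the SignExtend above is simply the integer conversion (hypothetical helper, not the package API):

	// maskCvtepi16Epi32 models _mm512_mask_cvtepi16_epi32; converting
	// int16 to int32 in Go performs the sign extension.
	func maskCvtepi16Epi32(src [16]int32, k uint16, a [16]int16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j]) // sign extend
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}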

func M512MaskCvtepi16Epi64

func M512MaskCvtepi16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_mask_cvtepi16_epi64'. Requires AVX512F.

func M512MaskCvtepi32Epi16

func M512MaskCvtepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_mask_cvtepi32_epi16'. Requires AVX512F.
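
The truncating narrow is likewise Go's integer conversion in the other direction (hypothetical helper, not the package API):

	// maskCvtepi32Epi16 models _mm512_mask_cvtepi32_epi16; converting
	// int32 to int16 in Go keeps only the low 16 bits, matching
	// Truncate_Int32_To_Int16.
	func maskCvtepi32Epi16(src [16]int16, k uint16, a [16]int32) (dst [16]int16) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int16(a[j]) // truncate: low 16 bits
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}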

func M512MaskCvtepi32Epi64

func M512MaskCvtepi32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_mask_cvtepi32_epi64'. Requires AVX512F.

func M512MaskCvtepi32Epi8

func M512MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_mask_cvtepi32_epi8'. Requires AVX512F.

func M512MaskCvtepi32Pd

func M512MaskCvtepi32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_mask_cvtepi32_pd'. Requires AVX512F.

func M512MaskCvtepi32Ps

func M512MaskCvtepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvtepi32_ps'. Requires AVX512F.

func M512MaskCvtepi64Epi16

func M512MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_mask_cvtepi64_epi16'. Requires AVX512F.

func M512MaskCvtepi64Epi32

func M512MaskCvtepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_mask_cvtepi64_epi32'. Requires AVX512F.

func M512MaskCvtepi64Epi8

func M512MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_mask_cvtepi64_epi8'. Requires AVX512F.

func M512MaskCvtepi8Epi32

func M512MaskCvtepi8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_mask_cvtepi8_epi32'. Requires AVX512F.

func M512MaskCvtepi8Epi64

func M512MaskCvtepi8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_mask_cvtepi8_epi64'. Requires AVX512F.

func M512MaskCvtepu16Epi32

func M512MaskCvtepu16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_mask_cvtepu16_epi32'. Requires AVX512F.
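
ZeroExtend also falls out of Go's conversion rules, since a uint16 value is never negative (hypothetical helper, not the package API):

	// maskCvtepu16Epi32 models _mm512_mask_cvtepu16_epi32.
	func maskCvtepu16Epi32(src [16]int32, k uint16, a [16]uint16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j]) // zero extend
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}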

func M512MaskCvtepu16Epi64

func M512MaskCvtepu16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_mask_cvtepu16_epi64'. Requires AVX512F.

func M512MaskCvtepu32Epi64

func M512MaskCvtepu32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_mask_cvtepu32_epi64'. Requires AVX512F.

func M512MaskCvtepu32Pd

func M512MaskCvtepu32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_mask_cvtepu32_pd'. Requires AVX512F.

func M512MaskCvtepu32Ps

func M512MaskCvtepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvtepu32_ps'. Requires AVX512F.

func M512MaskCvtepu8Epi32

func M512MaskCvtepu8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_mask_cvtepu8_epi32'. Requires AVX512F.

func M512MaskCvtepu8Epi64

func M512MaskCvtepu8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_mask_cvtepu8_epi64'. Requires AVX512F.

func M512MaskCvtpdEpi32

func M512MaskCvtpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvtpd_epi32'. Requires AVX512F.

func M512MaskCvtpdEpu32

func M512MaskCvtpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvtpd_epu32'. Requires AVX512F.

func M512MaskCvtpdPs

func M512MaskCvtpdPs(src x86.M256, k x86.Mmask8, a x86.M512d) (dst x86.M256)

M512MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvtpd_ps'. Requires AVX512F.

func M512MaskCvtphPs

func M512MaskCvtphPs(src x86.M512, k x86.Mmask16, a x86.M256i) (dst x86.M512)

M512MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvtph_ps'. Requires AVX512F.
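
A sketch of the Convert_FP16_To_FP32 step for one lane, covering normal values, infinities, and NaNs; subnormal inputs are flushed to signed zero here for brevity (assumes import "math"):

	// convertFP16ToFP32 widens an IEEE 754 half-precision bit pattern:
	// the exponent is rebiased from 15 to 127 and the 10-bit fraction is
	// shifted into the 23-bit single-precision fraction field.
	func convertFP16ToFP32(h uint16) float32 {
		sign := uint32(h>>15) << 31
		exp := uint32(h>>10) & 0x1f
		frac := uint32(h) & 0x3ff
		switch exp {
		case 0: // zero or subnormal (subnormals elided here)
			return math.Float32frombits(sign)
		case 0x1f: // Inf or NaN
			return math.Float32frombits(sign | 0xff<<23 | frac<<13)
		default: // normal
			return math.Float32frombits(sign | (exp+112)<<23 | frac<<13)
		}
	}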

func M512MaskCvtpsEpi32

func M512MaskCvtpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvtps_epi32'. Requires AVX512F.

func M512MaskCvtpsEpu32

func M512MaskCvtpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvtps_epu32'. Requires AVX512F.

func M512MaskCvtpsPd

func M512MaskCvtpsPd(src x86.M512d, k x86.Mmask8, a x86.M256) (dst x86.M512d)

M512MaskCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvtps_pd'. Requires AVX512F.

func M512MaskCvtpsPh

func M512MaskCvtpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvtps_ph'. Requires AVX512F.

func M512MaskCvtsepi32Epi16

func M512MaskCvtsepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_mask_cvtsepi32_epi16'. Requires AVX512F.

func M512MaskCvtsepi32Epi8

func M512MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_mask_cvtsepi32_epi8'. Requires AVX512F.
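
Saturate_Int32_To_Int8 clamps rather than wraps; a one-lane Go model:

	// saturateInt32ToInt8 clamps v to the int8 range [-128, 127].
	func saturateInt32ToInt8(v int32) int8 {
		switch {
		case v > 127:
			return 127
		case v < -128:
			return -128
		}
		return int8(v)
	}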

func M512MaskCvtsepi64Epi16

func M512MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_mask_cvtsepi64_epi16'. Requires AVX512F.

func M512MaskCvtsepi64Epi32

func M512MaskCvtsepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_mask_cvtsepi64_epi32'. Requires AVX512F.

func M512MaskCvtsepi64Epi8

func M512MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_mask_cvtsepi64_epi8'. Requires AVX512F.

func M512MaskCvttRoundpdEpi32

func M512MaskCvttRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epi32'. Requires AVX512F.

func M512MaskCvttRoundpdEpu32

func M512MaskCvttRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epu32'. Requires AVX512F.

func M512MaskCvttRoundpsEpi32

func M512MaskCvttRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epi32'. Requires AVX512F.

func M512MaskCvttRoundpsEpu32

func M512MaskCvttRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epu32'. Requires AVX512F.

func M512MaskCvttpdEpi32

func M512MaskCvttpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvttpd_epi32'. Requires AVX512F.
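
As a rough model of the truncation, note that Go's conversion from float64 to int32 also truncates toward zero. A hedged sketch of the masked loop follows (hypothetical helper, not the intrinsic; unlike the hardware, Go leaves out-of-range conversions implementation-dependent rather than producing 0x80000000):

	// cvttpdEpi32 converts each float64 lane to int32 with truncation
	// toward zero under writemask k; Go's int32(x) conversion truncates.
	func cvttpdEpi32(src [8]int32, k uint8, a [8]float64) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}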

func M512MaskCvttpdEpu32

func M512MaskCvttpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvttpd_epu32'. Requires AVX512F.

func M512MaskCvttpsEpi32

func M512MaskCvttpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvttps_epi32'. Requires AVX512F.

func M512MaskCvttpsEpu32

func M512MaskCvttpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvttps_epu32'. Requires AVX512F.

func M512MaskCvtusepi32Epi16

func M512MaskCvtusepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_mask_cvtusepi32_epi16'. Requires AVX512F.
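
The unsigned variant only needs an upper clamp, since the input is already non-negative. A minimal sketch (hypothetical helper name; assumes importing "math"):

	// saturateUint32ToUint16 clamps an unsigned 32-bit value to the
	// uint16 range, mirroring the unsigned-saturation step above.
	func saturateUint32ToUint16(v uint32) uint16 {
		if v > math.MaxUint16 {
			return math.MaxUint16
		}
		return uint16(v)
	}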

func M512MaskCvtusepi32Epi8

func M512MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_mask_cvtusepi32_epi8'. Requires AVX512F.

func M512MaskCvtusepi64Epi16

func M512MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_mask_cvtusepi64_epi16'. Requires AVX512F.

func M512MaskCvtusepi64Epi32

func M512MaskCvtusepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_mask_cvtusepi64_epi32'. Requires AVX512F.

func M512MaskCvtusepi64Epi8

func M512MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_mask_cvtusepi64_epi8'. Requires AVX512F.

func M512MaskDivEpi32

func M512MaskDivEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskDivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_div_epi32'. Requires AVX512F.
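
The '...' instruction field marks this as an SVML-style sequence rather than a single machine instruction. In Go the TRUNCATE step is implicit, because integer division already truncates toward zero; a sketch of the semantics, which applies equally to the unsigned variant below (hypothetical helper; note that b[j] == 0 panics in Go, whereas the intrinsic's behavior is up to the library):

	// divEpi32 divides lane by lane under writemask k. Go's integer
	// division truncates toward zero, matching TRUNCATE in the pseudocode.
	func divEpi32(src [16]int32, k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] / b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}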

func M512MaskDivEpu32

func M512MaskDivEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskDivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_div_epu32'. Requires AVX512F.

func M512MaskDivPd

func M512MaskDivPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_pd'. Requires AVX512F.

func M512MaskDivPs

func M512MaskDivPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_ps'. Requires AVX512F.

func M512MaskDivRoundPd

func M512MaskDivRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			IF k[j]
				dst[i+63:i] := a[i+63:i] / b[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_round_pd'. Requires AVX512F.

func M512MaskDivRoundPs

func M512MaskDivRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := a[i+31:i] / b[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_round_ps'. Requires AVX512F.

func M512MaskErfPd

func M512MaskErfPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erf_pd'. Requires AVX512F.
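
The masked elementwise pattern is the same for the whole erf/exp/log family in this section; a minimal pure-Go sketch using the standard library's math.Erf (hypothetical helper name, not the intrinsic):

	// maskErfPd applies math.Erf lane by lane under writemask k,
	// copying src where the mask bit is clear.
	func maskErfPd(src [8]float64, k uint8, a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Erf(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}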

func M512MaskErfPs

func M512MaskErfPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erf_ps'. Requires AVX512F.

func M512MaskErfcPd

func M512MaskErfcPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 1.0 - ERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfc_pd'. Requires AVX512F.

func M512MaskErfcPs

func M512MaskErfcPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 1.0 - ERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfc_ps'. Requires AVX512F.

func M512MaskErfcinvPd

func M512MaskErfcinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvERFC(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_pd'. Requires AVX512F.

func M512MaskErfcinvPs

func M512MaskErfcinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvERFC(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_ps'. Requires AVX512F.

func M512MaskErfinvPd

func M512MaskErfinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_pd'. Requires AVX512F.

func M512MaskErfinvPs

func M512MaskErfinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_ps'. Requires AVX512F.
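
The InvERF/InvERFC steps above are the functional inverses of erf and erfc, which Go's standard library exposes directly as math.Erfinv and math.Erfcinv. A single-precision sketch can compute in float64 and narrow back (hypothetical helper; same masking pattern as maskErfPd above):

	// maskErfinvPs computes the inverse error function per lane, going
	// through float64 since the math package has no float32 variants.
	func maskErfinvPs(src [16]float32, k uint16, a [16]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = float32(math.Erfinv(float64(a[j])))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}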

func M512MaskExp10Pd

func M512MaskExp10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 10^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp10_pd'. Requires AVX512F.

func M512MaskExp10Ps

func M512MaskExp10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 10^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp10_ps'. Requires AVX512F.

func M512MaskExp2Pd

func M512MaskExp2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 2^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp2_pd'. Requires AVX512F.

func M512MaskExp2Ps

func M512MaskExp2Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 2^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp2_ps'. Requires AVX512F.

func M512MaskExpPd

func M512MaskExpPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := e^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp_pd'. Requires AVX512F.

func M512MaskExpPs

func M512MaskExpPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := e^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp_ps'. Requires AVX512F.
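
The three exponential bases documented above map directly onto the standard library; a one-lane sketch (hypothetical helper, with the same masking pattern as maskErfPd):

	// expLane evaluates one lane for each exponential base documented
	// in this section: e^x, 2^x, and 10^x.
	func expLane(x float64) (e, e2, e10 float64) {
		return math.Exp(x), math.Exp2(x), math.Pow(10, x)
	}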

func M512MaskExpandEpi32

func M512MaskExpandEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_mask_expand_epi32'. Requires AVX512F.
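
Expand reads contiguous elements from the front of 'a' but scatters them into the destination lanes selected by 'k'. A pure-Go sketch of that bookkeeping (hypothetical helper, not the intrinsic):

	// maskExpandEpi32 consumes a[0], a[1], ... in order, writing each to
	// the next dst lane whose mask bit is set; other lanes copy src.
	func maskExpandEpi32(src [16]int32, k uint16, a [16]int32) (dst [16]int32) {
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m]
				m++
			} else {
				dst[j] = src[j]
			}
		}
		return
	}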

func M512MaskExpandEpi64

func M512MaskExpandEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_mask_expand_epi64'. Requires AVX512F.

func M512MaskExpandPd

func M512MaskExpandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_mask_expand_pd'. Requires AVX512F.

func M512MaskExpandPs

func M512MaskExpandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_mask_expand_ps'. Requires AVX512F.

func M512MaskExpm1Pd

func M512MaskExpm1Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := e^(a[i+63:i]) - 1.0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_expm1_pd'. Requires AVX512F.

func M512MaskExpm1Ps

func M512MaskExpm1Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := e^(a[i+31:i]) - 1.0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_expm1_ps'. Requires AVX512F.

func M512MaskExtractf32x4Ps

func M512MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)

M512MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_mask_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
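
Treating the 512-bit vector as sixteen float32 lanes, the extract is a 4-element window selected by imm8's low two bits, followed by the writemask. A pure-Go sketch (hypothetical helper, not the intrinsic):

	// maskExtractF32x4 selects one of the four 128-bit lanes (four
	// float32 values each) and applies the 4-bit writemask.
	func maskExtractF32x4(src [4]float32, k uint8, a [16]float32, imm8 byte) (dst [4]float32) {
		base := int(imm8&3) * 4 // imm8[1:0] selects the 128-bit lane
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[base+j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}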

func M512MaskExtractf64x4Pd

func M512MaskExtractf64x4Pd(src x86.M256d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)

M512MaskExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_mask_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti32x4Epi32

func M512MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_mask_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti64x4Epi64

func M512MaskExtracti64x4Epi64(src x86.M256i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_mask_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmPd

func M512MaskFixupimmPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
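
The core of FIXUPIMM is a table lookup: the token classification of the input selects a 4-bit nibble of 'c', and that nibble selects the response. A sketch of just the nibble extraction, with the token class j already computed (hypothetical helper):

	// tokenResponse extracts the 4-bit response code for token class j
	// (0..7) from the 64-bit table word, as in
	// token_response[3:0] := src3[3+4*j:4*j].
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8((src3 >> (4 * j)) & 0xF)
	}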

func M512MaskFixupimmPs

func M512MaskFixupimmPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmRoundPd

func M512MaskFixupimmRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512MaskFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmRoundPs

func M512MaskFixupimmRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512MaskFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFloorPd

func M512MaskFloorPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskFloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FLOOR(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_floor_pd'. Requires AVX512F.

func M512MaskFloorPs

func M512MaskFloorPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskFloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FLOOR(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_floor_ps'. Requires AVX512F.

func M512MaskFmaddsubPd

func M512MaskFmaddsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_pd'. Requires AVX512F.
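
A pure-Go sketch of the even/odd alternation, using math.FMA so each lane gets the single rounding a hardware fused multiply-add provides (hypothetical helper; the fmsubadd variants below simply swap the even/odd roles):

	// maskFmaddsubPd multiplies a and b, then subtracts c in even lanes
	// and adds it in odd lanes; unselected lanes keep a.
	func maskFmaddsubPd(a [8]float64, k uint8, b, c [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = a[j]
			case j%2 == 0:
				dst[j] = math.FMA(a[j], b[j], -c[j])
			default:
				dst[j] = math.FMA(a[j], b[j], c[j])
			}
		}
		return
	}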

func M512MaskFmaddsubPs

func M512MaskFmaddsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_ps'. Requires AVX512F.

func M512MaskFmaddsubRoundPd

func M512MaskFmaddsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_round_pd'. Requires AVX512F.

func M512MaskFmaddsubRoundPs

func M512MaskFmaddsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_round_ps'. Requires AVX512F.

func M512MaskFmsubaddPd

func M512MaskFmsubaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_pd'. Requires AVX512F.

func M512MaskFmsubaddPs

func M512MaskFmsubaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_ps'. Requires AVX512F.

func M512MaskFmsubaddRoundPd

func M512MaskFmsubaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_round_pd'. Requires AVX512F.

func M512MaskFmsubaddRoundPs

func M512MaskFmsubaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskHypotPd

func M512MaskHypotPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskHypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_hypot_pd'. Requires AVX512F.
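
In Go the natural building block is math.Hypot, which also guards against the overflow and underflow that the naive SQRT(a^2 + b^2) form can hit at the extremes; a masked sketch (hypothetical helper, not the intrinsic):

	// maskHypotPd computes sqrt(a*a + b*b) per lane via math.Hypot,
	// copying src where the mask bit is clear.
	func maskHypotPd(src [8]float64, k uint8, a, b [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Hypot(a[j], b[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}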

func M512MaskHypotPs

func M512MaskHypotPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskHypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_hypot_ps'. Requires AVX512F.

func M512MaskInsertf32x4

func M512MaskInsertf32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_mask_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf64x4

func M512MaskInsertf64x4(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512MaskInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_mask_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti32x4

func M512MaskInserti32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_mask_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti64x4

func M512MaskInserti64x4(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_mask_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInvsqrtPd

func M512MaskInvsqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskInvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvSQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_pd'. Requires AVX512F.

func M512MaskInvsqrtPs

func M512MaskInvsqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskInvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvSQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_ps'. Requires AVX512F.

func M512MaskLog10Pd

func M512MaskLog10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := log10(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log10_pd'. Requires AVX512F.

func M512MaskLog10Ps

func M512MaskLog10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := log10(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log10_ps'. Requires AVX512F.

func M512MaskLog1pPd

func M512MaskLog1pPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ln(1.0 + a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log1p_pd'. Requires AVX512F.

func M512MaskLog1pPs

func M512MaskLog1pPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ln(1.0 + a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log1p_ps'. Requires AVX512F.

func M512MaskLog2Pd

func M512MaskLog2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := log2(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log2_pd'. Requires AVX512F.

func M512MaskLogPd

func M512MaskLogPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ln(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log_pd'. Requires AVX512F.

func M512MaskLogPs

func M512MaskLogPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ln(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log_ps'. Requires AVX512F.

func M512MaskLogbPd

func M512MaskLogbPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_logb_pd'. Requires AVX512F.

func M512MaskLogbPs

func M512MaskLogbPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_logb_ps'. Requires AVX512F.
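
Go's standard library exposes this exponent extraction directly as math.Logb, which returns the binary exponent of its argument, matching the 'floor(log2(x))' description above for finite nonzero inputs. A sketch of the masked double-precision variant (illustrative names):

	import "math"

	// maskLogbPd mirrors the pseudocode above using math.Logb.
	func maskLogbPd(src, a [8]float64, k uint8) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Logb(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}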

func M512MaskMaxEpi64

func M512MaskMaxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_mask_max_epi64'. Requires AVX512F.

func M512MaskMaxEpu64

func M512MaskMaxEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_mask_max_epu64'. Requires AVX512F.
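
A sketch of the masked 64-bit maximum on plain arrays; for the signed variant above, substitute int64 for uint64 (names are illustrative, and the min entries below simply flip the comparison):

	func maskMaxEpu64(src, a, b [8]uint64, k uint8) (dst [8]uint64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				if a[j] > b[j] { // element-wise maximum
					dst[j] = a[j]
				} else {
					dst[j] = b[j]
				}
			} else {
				dst[j] = src[j] // mask bit clear: keep src
			}
		}
		return
	}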

func M512MaskMaxPd

func M512MaskMaxPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_pd'. Requires AVX512F.

func M512MaskMaxPs

func M512MaskMaxPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_ps'. Requires AVX512F.

func M512MaskMaxRoundPd

func M512MaskMaxRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_round_pd'. Requires AVX512F.

func M512MaskMaxRoundPs

func M512MaskMaxRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_round_ps'. Requires AVX512F.

func M512MaskMinEpi64

func M512MaskMinEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_mask_min_epi64'. Requires AVX512F.

func M512MaskMinEpu64

func M512MaskMinEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_mask_min_epu64'. Requires AVX512F.

func M512MaskMinPd

func M512MaskMinPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_pd'. Requires AVX512F.

func M512MaskMinPs

func M512MaskMinPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_ps'. Requires AVX512F.

func M512MaskMinRoundPd

func M512MaskMinRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_round_pd'. Requires AVX512F.

func M512MaskMinRoundPs

func M512MaskMinRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_round_ps'. Requires AVX512F.

func M512MaskMovedupPd

func M512MaskMovedupPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_mask_movedup_pd'. Requires AVX512F.
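
The duplication pattern above is easier to see on plain arrays: each even/odd element pair receives two copies of the even-indexed source element before the mask blend. A sketch (illustrative names; the movehdup/moveldup entries below are the 32-bit analogues, copying from the odd or even index of each pair):

	func maskMovedupPd(src, a [8]float64, k uint8) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j += 2 {
			tmp[j], tmp[j+1] = a[j], a[j] // duplicate the even-indexed element
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}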

func M512MaskMovehdupPs

func M512MaskMovehdupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_mask_movehdup_ps'. Requires AVX512F.

func M512MaskMoveldupPs

func M512MaskMoveldupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_mask_moveldup_ps'. Requires AVX512F.

func M512MaskMulEpi32

func M512MaskMulEpi32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mask_mul_epi32'. Requires AVX512F.

func M512MaskMulEpu32

func M512MaskMulEpu32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mask_mul_epu32'. Requires AVX512F.
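
Only the low 32 bits of each 64-bit lane participate in this multiply, and the full 64-bit product is kept. A sketch on plain arrays (illustrative names; the signed mul_epi32 entry above is the same with int32/int64 conversions):

	func maskMulEpu32(src, a, b [8]uint64, k uint8) (dst [8]uint64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				// truncate each lane to its low 32 bits, then widen the product
				dst[j] = uint64(uint32(a[j])) * uint64(uint32(b[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}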

func M512MaskMulloxEpi64

func M512MaskMulloxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloxEpi64: Multiplies elements in packed 64-bit integer vectors 'a' and 'b' together, storing the lower 64 bits of the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_mullox_epi64'. Requires AVX512F.

func M512MaskNearbyintPd

func M512MaskNearbyintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskNearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := NearbyInt(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_pd'. Requires AVX512F.

func M512MaskNearbyintPs

func M512MaskNearbyintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskNearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := NearbyInt(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_ps'. Requires AVX512F.

func M512MaskPermutePd

func M512MaskPermutePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
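
Within each 128-bit lane (one pair of elements), bit j of 'imm8' picks the low or high source element for output j. A sketch of that selection plus the mask blend (illustrative names):

	func maskPermutePd(src, a [8]float64, k uint8, imm8 byte) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j++ {
			base := j &^ 1 // first element of this 128-bit lane
			if imm8&(1<<uint(j)) != 0 {
				tmp[j] = a[base+1] // high element of the lane
			} else {
				tmp[j] = a[base] // low element of the lane
			}
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}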

func M512MaskPermutePs

func M512MaskPermutePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutevarPd

func M512MaskPermutevarPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)

M512MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permutevar_pd'. Requires AVX512F.

func M512MaskPermutevarPs

func M512MaskPermutevarPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)

M512MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permutevar_ps'. Requires AVX512F.

func M512MaskPermutex2varEpi32

func M512MaskPermutex2varEpi32(a x86.M512i, k x86.Mmask16, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm512_mask_permutex2var_epi32'. Requires AVX512F.
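
Each index element steers one output: its low four bits select a source element and bit 4 selects between 'a' and 'b'. Note that unselected outputs fall back to 'a' here, not to a separate 'src' operand. A sketch (illustrative names):

	func maskPermutex2varEpi32(a [16]uint32, k uint16, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				sel := idx[j] & 0xF // idx[i+3:i]: element selector
				if idx[j]&0x10 != 0 { // idx[i+4]: source selector
					dst[j] = b[sel]
				} else {
					dst[j] = a[sel]
				}
			} else {
				dst[j] = a[j] // mask bit clear: keep the 'a' element
			}
		}
		return
	}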

func M512MaskPermutex2varEpi64

func M512MaskPermutex2varEpi64(a x86.M512i, k x86.Mmask8, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm512_mask_permutex2var_epi64'. Requires AVX512F.

func M512MaskPermutex2varPd

func M512MaskPermutex2varPd(a x86.M512d, k x86.Mmask8, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm512_mask_permutex2var_pd'. Requires AVX512F.

func M512MaskPermutex2varPs

func M512MaskPermutex2varPs(a x86.M512, k x86.Mmask16, idx x86.M512i, b x86.M512) (dst x86.M512)

M512MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm512_mask_permutex2var_ps'. Requires AVX512F.

func M512MaskPermutexEpi64

func M512MaskPermutexEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutexPd

func M512MaskPermutexPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutexvarEpi32

func M512MaskPermutexvarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_mask_permutexvar_epi32'. Requires AVX512F.
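
Unlike the in-lane permutes earlier, this one indexes across the whole vector: output j is simply a[idx[j] mod 16]. A sketch (illustrative names):

	func maskPermutexvarEpi32(src [16]uint32, k uint16, idx, a [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[idx[j]&0xF] // low 4 index bits select the element
			} else {
				dst[j] = src[j]
			}
		}
		return
	}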

func M512MaskPermutexvarEpi64

func M512MaskPermutexvarEpi64(src x86.M512i, k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutexvar_epi64'. Requires AVX512F.

func M512MaskPermutexvarPd

func M512MaskPermutexvarPd(src x86.M512d, k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutexvar_pd'. Requires AVX512F.

func M512MaskPermutexvarPs

func M512MaskPermutexvarPs(src x86.M512, k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)

M512MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_mask_permutexvar_ps'. Requires AVX512F.

func M512MaskPowPd

func M512MaskPowPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskPowPd: Raise packed double-precision (64-bit) floating-point elements in 'a' to the power of the corresponding packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_pow_pd'. Requires AVX512F.

func M512MaskPowPs

func M512MaskPowPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskPowPs: Raise packed single-precision (32-bit) floating-point elements in 'a' to the power of the corresponding packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_pow_ps'. Requires AVX512F.

func M512MaskRcp14Pd

func M512MaskRcp14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_mask_rcp14_pd'. Requires AVX512F.

func M512MaskRcp14Ps

func M512MaskRcp14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_mask_rcp14_ps'. Requires AVX512F.

func M512MaskRecipPd

func M512MaskRecipPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (1 / a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_recip_pd'. Requires AVX512F.

func M512MaskRecipPs

func M512MaskRecipPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (1 / a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_recip_ps'. Requires AVX512F.

func M512MaskRemEpi32

func M512MaskRemEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rem_epi32'. Requires AVX512F.

func M512MaskRemEpu32

func M512MaskRemEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rem_epu32'. Requires AVX512F.
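
For the signed variant above, Go's % operator computes the same truncated-division remainder that REMAINDER denotes. A sketch, assuming nonzero divisors (illustrative names; the unsigned form substitutes uint32):

	func maskRemEpi32(src, a, b [16]int32, k uint16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] % b[j] // panics if b[j] == 0, as integer division would
			} else {
				dst[j] = src[j]
			}
		}
		return
	}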

func M512MaskRintPd

func M512MaskRintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRintPd: Rounds the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundToNearestEven(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rint_pd'. Requires AVX512F.

func M512MaskRintPs

func M512MaskRintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRintPs: Rounds the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundToNearestEven(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rint_ps'. Requires AVX512F.

func M512MaskRolEpi32

func M512MaskRolEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
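
Go's math/bits package implements exactly this rotation, including the modulo-32 reduction of the count. A sketch (illustrative names):

	import "math/bits"

	func maskRolEpi32(src, a [16]uint32, k uint16, imm8 byte) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(imm8)) // count reduced mod 32
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

For the ror entries further below, rotate right by passing a negative count (bits.RotateLeft32(x, -n)); the rolv/rorv forms take the per-element count from 'b' instead of 'imm8'.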

func M512MaskRolEpi64

func M512MaskRolEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRolvEpi32

func M512MaskRolvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_mask_rolv_epi32'. Requires AVX512F.

func M512MaskRolvEpi64

func M512MaskRolvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_mask_rolv_epi64'. Requires AVX512F.

func M512MaskRorEpi32

func M512MaskRorEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRorEpi64

func M512MaskRorEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRorvEpi32

func M512MaskRorvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_mask_rorv_epi32'. Requires AVX512F.

func M512MaskRorvEpi64

func M512MaskRorvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_mask_rorv_epi64'. Requires AVX512F.

func M512MaskRoundscalePd

func M512MaskRoundscalePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
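
Stripped of the mask loop and the precision-exception bookkeeping, one roundscale element is: scale up by 2^M, round to an integer, scale back down. A sketch for the round-to-nearest-even case (imm8[1:0] == 0 and imm8[2] == 0; names are illustrative):

	import "math"

	func roundscale(x float64, m uint8) float64 {
		s := math.Ldexp(1, int(m)) // 2^M, with M taken from imm8[7:4]
		return math.RoundToEven(x*s) / s
	}

For example, roundscale(1.2345, 2) rounds to 2 binary fraction bits and yields 1.25.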

func M512MaskRoundscalePs

func M512MaskRoundscalePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRoundscaleRoundPd

func M512MaskRoundscaleRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRoundscaleRoundPs

func M512MaskRoundscaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRsqrt14Pd

func M512MaskRsqrt14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_mask_rsqrt14_pd'. Requires AVX512F.

func M512MaskRsqrt14Ps

func M512MaskRsqrt14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_mask_rsqrt14_ps'. Requires AVX512F.

func M512MaskScalefPd

func M512MaskScalefPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_pd'. Requires AVX512F.
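
Ignoring the NaN and denormal special cases handled by the SCALE helper above, the core operation multiplies 'a' by 2 raised to the floor of 'b', which math.Ldexp expresses directly. A sketch (illustrative names):

	import "math"

	func scalef(a, b float64) float64 {
		return math.Ldexp(a, int(math.Floor(b))) // a * 2^floor(b)
	}

For example, scalef(3.0, 2.7) is 3 * 2^2 = 12.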

func M512MaskScalefPs

func M512MaskScalefPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_ps'. Requires AVX512F.

func M512MaskScalefRoundPd

func M512MaskScalefRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_round_pd'. Requires AVX512F.

func M512MaskScalefRoundPs

func M512MaskScalefRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_round_ps'. Requires AVX512F.

func M512MaskSet1Epi32

func M512MaskSet1Epi32(src x86.M512i, k x86.Mmask16, a int) (dst x86.M512i)

M512MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_set1_epi32'. Requires AVX512F.

func M512MaskSet1Epi64

func M512MaskSet1Epi64(src x86.M512i, k x86.Mmask8, a int64) (dst x86.M512i)

M512MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_set1_epi64'. Requires AVX512F.
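
A minimal pure-Go sketch of this masked broadcast (hypothetical names; each lane selects between the scalar 'a' and the corresponding 'src' element):

	package main

	import "fmt"

	// maskSet1 models _mm512_mask_set1_epi64: broadcast a to every lane
	// whose mask bit is set, and copy src elsewhere.
	func maskSet1(src []int64, k uint8, a int64) []int64 {
		dst := make([]int64, len(src))
		for j := range src {
			if (k>>uint(j))&1 == 1 {
				dst[j] = a
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}

	func main() {
		src := []int64{0, 1, 2, 3, 4, 5, 6, 7}
		fmt.Println(maskSet1(src, 0b10100101, 99)) // [99 1 99 3 4 99 6 99]
	}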

func M512MaskShuffleF32x4

func M512MaskShuffleF32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_mask_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
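
Ignoring the writemask step, the SELECT4 lane shuffle can be sketched in pure Go by treating each 128-bit lane as a block of four floats (names hypothetical):

	package main

	import "fmt"

	type lane = [4]float32 // one 128-bit block of four floats

	// select4 models SELECT4: pick lane control&3 from src.
	func select4(src [4]lane, control byte) lane {
		return src[control&3]
	}

	// shuffleF32x4 models the unmasked core of _mm512_mask_shuffle_f32x4:
	// the two low result lanes come from a, the two high lanes from b.
	func shuffleF32x4(a, b [4]lane, imm8 byte) [4]lane {
		return [4]lane{
			select4(a, imm8),    // imm8[1:0]
			select4(a, imm8>>2), // imm8[3:2]
			select4(b, imm8>>4), // imm8[5:4]
			select4(b, imm8>>6), // imm8[7:6]
		}
	}

	func main() {
		var a, b [4]lane
		for i := range a {
			a[i] = lane{float32(10 * i), 0, 0, 0}
			b[i] = lane{float32(100 * i), 0, 0, 0}
		}
		// 0b00011011 selects a[3], a[2], b[1], b[0].
		fmt.Println(shuffleF32x4(a, b, 0b00011011))
	}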

func M512MaskShuffleF64x2

func M512MaskShuffleF64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_mask_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShuffleI32x4

func M512MaskShuffleI32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_mask_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShuffleI64x2

func M512MaskShuffleI64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_mask_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShufflePd

func M512MaskShufflePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShufflePs

func M512MaskShufflePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
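
Within each 128-bit lane the same SELECT4 pattern picks 32-bit elements rather than whole lanes. A sketch of a single lane in pure Go (masking and the remaining lanes omitted):

	package main

	import "fmt"

	// shufpsLane models one 128-bit lane of VSHUFPS: the two low results
	// are selected from a by imm8[1:0] and imm8[3:2], the two high
	// results from b by imm8[5:4] and imm8[7:6].
	func shufpsLane(a, b [4]float32, imm8 byte) [4]float32 {
		return [4]float32{
			a[imm8&3],
			a[(imm8>>2)&3],
			b[(imm8>>4)&3],
			b[(imm8>>6)&3],
		}
	}

	func main() {
		a := [4]float32{0, 1, 2, 3}
		b := [4]float32{10, 11, 12, 13}
		fmt.Println(shufpsLane(a, b, 0b01001110)) // [2 3 10 11]
	}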

func M512MaskSinPd

func M512MaskSinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sin_pd'. Requires AVX512F.

func M512MaskSinPs

func M512MaskSinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sin_ps'. Requires AVX512F.

func M512MaskSincosPd

func M512MaskSincosPd(cos_res *x86.M512d, sin_src x86.M512d, cos_src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSincosPd: Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIN(a[i+63:i])
		cos_res[i+63:i] := COS(a[i+63:i])
	ELSE
		dst[i+63:i] := sin_src[i+63:i]
		cos_res[i+63:i] := cos_src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sincos_pd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskSincosPs

func M512MaskSincosPs(cos_res *x86.M512, sin_src x86.M512, cos_src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSincosPs: Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIN(a[i+31:i])
		cos_res[i+31:i] := COS(a[i+31:i])
	ELSE
		dst[i+31:i] := sin_src[i+31:i]
		cos_res[i+31:i] := cos_src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sincos_ps'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
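
A pure-Go sketch of the masked sincos semantics, using math.Sincos as the scalar kernel (double-precision shown; the single-precision variant is analogous, and the names are hypothetical):

	package main

	import (
		"fmt"
		"math"
	)

	// maskSincos models _mm512_mask_sincos_pd: active lanes get SIN/COS
	// of a; inactive lanes copy sinSrc and cosSrc respectively.
	func maskSincos(sinSrc, cosSrc, a []float64, k uint8) (sin, cos []float64) {
		sin = make([]float64, len(a))
		cos = make([]float64, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 1 {
				sin[j], cos[j] = math.Sincos(a[j])
			} else {
				sin[j], cos[j] = sinSrc[j], cosSrc[j]
			}
		}
		return sin, cos
	}

	func main() {
		a := []float64{0, math.Pi / 2, math.Pi, 0, 0, 0, 0, 0}
		zero := make([]float64, 8)
		sin, cos := maskSincos(zero, zero, a, 0b00000111)
		fmt.Println(sin[:3], cos[:3]) // approximately [0 1 0] [1 0 -1]
	}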

func M512MaskSindPd

func M512MaskSindPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sind_pd'. Requires AVX512F.

func M512MaskSindPs

func M512MaskSindPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIND(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sind_ps'. Requires AVX512F.

func M512MaskSinhPd

func M512MaskSinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SINH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sinh_pd'. Requires AVX512F.

func M512MaskSinhPs

func M512MaskSinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SINH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sinh_ps'. Requires AVX512F.

func M512MaskSllEpi32

func M512MaskSllEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_mask_sll_epi32'. Requires AVX512F.
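
Note that the whole 64-bit 'count' is consulted: any count above 31 zeroes the lane outright rather than being reduced modulo 32. A pure-Go sketch:

	package main

	import "fmt"

	// maskSllEpi32 models _mm512_mask_sll_epi32: shift every active lane
	// left by the same 64-bit count, producing 0 when count > 31.
	func maskSllEpi32(src, a []uint32, k uint16, count uint64) []uint32 {
		dst := make([]uint32, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 0 {
				dst[j] = src[j]
				continue
			}
			if count > 31 {
				dst[j] = 0 // counts above 31 zero the lane, no modulo
			} else {
				dst[j] = a[j] << count
			}
		}
		return dst
	}

	func main() {
		a := []uint32{1, 2, 3, 4}
		src := []uint32{9, 9, 9, 9}
		fmt.Println(maskSllEpi32(src, a, 0b0111, 4))  // [16 32 48 9]
		fmt.Println(maskSllEpi32(src, a, 0b0111, 40)) // [0 0 0 9]
	}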

func M512MaskSllEpi64

func M512MaskSllEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_sll_epi64'. Requires AVX512F.

func M512MaskSlliEpi64

func M512MaskSlliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskSllvEpi64

func M512MaskSllvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_mask_sllv_epi64'. Requires AVX512F.

func M512MaskSqrtPd

func M512MaskSqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_pd'. Requires AVX512F.

func M512MaskSqrtPs

func M512MaskSqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_ps'. Requires AVX512F.

func M512MaskSqrtRoundPd

func M512MaskSqrtRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SQRT(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_round_pd'. Requires AVX512F.

func M512MaskSqrtRoundPs

func M512MaskSqrtRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SQRT(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_round_ps'. Requires AVX512F.

func M512MaskSraEpi32

func M512MaskSraEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_mask_sra_epi32'. Requires AVX512F.

func M512MaskSraEpi64

func M512MaskSraEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_sra_epi64'. Requires AVX512F.

func M512MaskSraiEpi64

func M512MaskSraiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
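
Arithmetic right shifts replicate the sign bit, which is what Go's >> operator already does for signed integers; shift counts of 64 or more fill the lane with the sign bit. A sketch of the immediate variant (names hypothetical):

	package main

	import "fmt"

	// maskSraiEpi64 models _mm512_mask_srai_epi64. Shifts of 64 or more
	// yield all sign bits, so they are clamped to 63 here.
	func maskSraiEpi64(src, a []int64, k uint8, imm8 uint8) []int64 {
		dst := make([]int64, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 0 {
				dst[j] = src[j]
				continue
			}
			n := uint(imm8)
			if n > 63 {
				n = 63 // a[j]>>63 is 0 for non-negative, -1 for negative
			}
			dst[j] = a[j] >> n
		}
		return dst
	}

	func main() {
		a := []int64{-16, 16, -1, 1}
		src := []int64{0, 0, 0, 0}
		fmt.Println(maskSraiEpi64(src, a, 0b1111, 2))   // [-4 4 -1 0]
		fmt.Println(maskSraiEpi64(src, a, 0b1111, 100)) // [-1 0 -1 0]
	}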

func M512MaskSravEpi64

func M512MaskSravEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_mask_srav_epi64'. Requires AVX512F.

func M512MaskSrlEpi32

func M512MaskSrlEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_mask_srl_epi32'. Requires AVX512F.

func M512MaskSrlEpi64

func M512MaskSrlEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srl_epi64'. Requires AVX512F.

func M512MaskSrliEpi64

func M512MaskSrliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskSrlvEpi64

func M512MaskSrlvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_mask_srlv_epi64'. Requires AVX512F.

func M512MaskSubEpi64

func M512MaskSubEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_mask_sub_epi64'. Requires AVX512F.

func M512MaskSvmlRoundPd

func M512MaskSvmlRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ROUND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_svml_round_pd'. Requires AVX512F.

func M512MaskTanPd

func M512MaskTanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TAN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tan_pd'. Requires AVX512F.

func M512MaskTanPs

func M512MaskTanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TAN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tan_ps'. Requires AVX512F.

func M512MaskTandPd

func M512MaskTandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TAND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tand_pd'. Requires AVX512F.

func M512MaskTandPs

func M512MaskTandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TAND(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tand_ps'. Requires AVX512F.

func M512MaskTanhPd

func M512MaskTanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TANH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tanh_pd'. Requires AVX512F.

func M512MaskTanhPs

func M512MaskTanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TANH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tanh_ps'. Requires AVX512F.

func M512MaskTernarylogicEpi32

func M512MaskTernarylogicEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskTernarylogicEpi64

func M512MaskTernarylogicEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
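
In other words, 'imm8' is an 8-entry truth table indexed by the bit triple (src, a, b). A pure-Go sketch on a single 64-bit lane (the vector forms repeat this per lane, under the writemask):

	package main

	import "fmt"

	// ternlog models one lane of VPTERNLOGQ: for every bit position, the
	// triple (src, a, b) forms a 3-bit index into the imm8 truth table.
	func ternlog(src, a, b uint64, imm8 uint8) uint64 {
		var dst uint64
		for h := uint(0); h < 64; h++ {
			idx := (src>>h&1)<<2 | (a>>h&1)<<1 | b>>h&1
			dst |= uint64(imm8>>idx&1) << h
		}
		return dst
	}

	func main() {
		// imm8 = 0x96 is the classic 3-input XOR table: its bit is set
		// for indices 1, 2, 4, 7 (an odd number of input ones).
		fmt.Printf("%x\n", ternlog(0xF0F0, 0xCCCC, 0xAAAA, 0x96)) // 9696
	}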

func M512MaskTestEpi64Mask

func M512MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_mask_test_epi64_mask'. Requires AVX512F.

func M512MaskTestnEpi32Mask

func M512MaskTestnEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_mask_testn_epi32_mask'. Requires AVX512F.

func M512MaskTestnEpi64Mask

func M512MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_mask_testn_epi64_mask'. Requires AVX512F.
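
A pure-Go sketch of the test/testn pair on 64-bit lanes: both AND the lanes and are pre-filtered by 'k1'; test sets the mask bit when the AND is non-zero, testn when it is zero (names hypothetical):

	package main

	import "fmt"

	// maskTestEpi64Mask models VPTESTMQ: mask bit j is set when lane j
	// is selected by k1 and a&b is non-zero there.
	func maskTestEpi64Mask(k1 uint8, a, b []uint64) uint8 {
		var k uint8
		for j := range a {
			if (k1>>uint(j))&1 == 1 && a[j]&b[j] != 0 {
				k |= 1 << uint(j)
			}
		}
		return k
	}

	// maskTestnEpi64Mask models VPTESTNMQ: the same, but for a&b == 0.
	func maskTestnEpi64Mask(k1 uint8, a, b []uint64) uint8 {
		var k uint8
		for j := range a {
			if (k1>>uint(j))&1 == 1 && a[j]&b[j] == 0 {
				k |= 1 << uint(j)
			}
		}
		return k
	}

	func main() {
		a := []uint64{1, 2, 4, 8}
		b := []uint64{1, 1, 4, 4}
		fmt.Printf("%04b %04b\n",
			maskTestEpi64Mask(0b1111, a, b),  // lanes 0 and 2 overlap
			maskTestnEpi64Mask(0b1111, a, b)) // lanes 1 and 3 do not
	}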

func M512MaskTruncPd

func M512MaskTruncPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TRUNCATE(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_trunc_pd'. Requires AVX512F.

func M512MaskTruncPs

func M512MaskTruncPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_trunc_ps'. Requires AVX512F.

func M512MaskUnpackhiEpi32

func M512MaskUnpackhiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_mask_unpackhi_epi32'. Requires AVX512F.

func M512MaskUnpackhiEpi64

func M512MaskUnpackhiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_mask_unpackhi_epi64'. Requires AVX512F.
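
Ignoring the writemask, the 64-bit interleave can be sketched in pure Go; each 128-bit lane (a pair of elements) contributes its high element from 'a', then from 'b' (names hypothetical):

	package main

	import "fmt"

	// unpackhiEpi64 models the unmasked core of VPUNPCKHQDQ: within each
	// 128-bit lane, take the high element of a, then the high element of b.
	func unpackhiEpi64(a, b [8]int64) [8]int64 {
		var dst [8]int64
		for lane := 0; lane < 4; lane++ {
			dst[2*lane] = a[2*lane+1]   // high qword of a's lane
			dst[2*lane+1] = b[2*lane+1] // high qword of b's lane
		}
		return dst
	}

	func main() {
		a := [8]int64{0, 1, 2, 3, 4, 5, 6, 7}
		b := [8]int64{10, 11, 12, 13, 14, 15, 16, 17}
		fmt.Println(unpackhiEpi64(a, b)) // [1 11 3 13 5 15 7 17]
	}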

func M512MaskUnpackhiPd

func M512MaskUnpackhiPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_mask_unpackhi_pd'. Requires AVX512F.

func M512MaskUnpackhiPs

func M512MaskUnpackhiPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_mask_unpackhi_ps'. Requires AVX512F.

func M512MaskUnpackloEpi32

func M512MaskUnpackloEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_mask_unpacklo_epi32'. Requires AVX512F.

func M512MaskUnpackloEpi64

func M512MaskUnpackloEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_mask_unpacklo_epi64'. Requires AVX512F.

func M512MaskUnpackloPd

func M512MaskUnpackloPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_mask_unpacklo_pd'. Requires AVX512F.

func M512MaskUnpackloPs

func M512MaskUnpackloPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_mask_unpacklo_ps'. Requires AVX512F.

func M512MaskzAbsEpi32

func M512MaskzAbsEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_maskz_abs_epi32'. Requires AVX512F.

func M512MaskzAbsEpi64

func M512MaskzAbsEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_maskz_abs_epi64'. Requires AVX512F.

func M512MaskzAddEpi32

func M512MaskzAddEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm512_maskz_add_epi32'. Requires AVX512F.

func M512MaskzAddEpi64

func M512MaskzAddEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_maskz_add_epi64'. Requires AVX512F.

func M512MaskzAddPd

func M512MaskzAddPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAddPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_pd'. Requires AVX512F.

func M512MaskzAddPs

func M512MaskzAddPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_ps'. Requires AVX512F.

func M512MaskzAddRoundPd

func M512MaskzAddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzAddRoundPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] + b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_round_pd'. Requires AVX512F.

func M512MaskzAddRoundPs

func M512MaskzAddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzAddRoundPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] + b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_round_ps'. Requires AVX512F.

func M512MaskzAlignrEpi32

func M512MaskzAlignrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskzAlignrEpi32: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 32-bit elements, and store the low 64 bytes (16 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := temp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGND'. Intrinsic: '_mm512_maskz_alignr_epi32'. Requires AVX512F.

func M512MaskzAlignrEpi64

func M512MaskzAlignrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskzAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := temp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_maskz_alignr_epi64'. Requires AVX512F.
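
A pure-Go sketch of the unmasked valign core: 'b' supplies the low half of the 1024-bit intermediate, 'a' the high half, and the result is an 8-element window starting 'count' elements from the bottom (names hypothetical):

	package main

	import "fmt"

	// alignrEpi64 models the unmasked core of VALIGNQ: concatenate a
	// above b and return the 8 elements starting count elements from the
	// bottom. count is taken in the 0..8 range for this sketch.
	func alignrEpi64(a, b [8]int64, count int) [8]int64 {
		temp := append(b[:], a[:]...) // temp[0..7]=b, temp[8..15]=a
		var dst [8]int64
		copy(dst[:], temp[count:])
		return dst
	}

	func main() {
		a := [8]int64{10, 11, 12, 13, 14, 15, 16, 17}
		b := [8]int64{0, 1, 2, 3, 4, 5, 6, 7}
		fmt.Println(alignrEpi64(a, b, 3)) // [3 4 5 6 7 10 11 12]
	}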

func M512MaskzAndEpi32

func M512MaskzAndEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_maskz_and_epi32'. Requires AVX512F.

func M512MaskzAndEpi64

func M512MaskzAndEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_maskz_and_epi64'. Requires AVX512F.

func M512MaskzAndnotEpi32

func M512MaskzAndnotEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm512_maskz_andnot_epi32'. Requires AVX512F.

func M512MaskzAndnotEpi64

func M512MaskzAndnotEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm512_maskz_andnot_epi64'. Requires AVX512F.

func M512MaskzBroadcastF32x4

func M512MaskzBroadcastF32x4(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_maskz_broadcast_f32x4'. Requires AVX512F.

func M512MaskzBroadcastF64x4

func M512MaskzBroadcastF64x4(k x86.Mmask8, a x86.M256d) (dst x86.M512d)

M512MaskzBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_maskz_broadcast_f64x4'. Requires AVX512F.

func M512MaskzBroadcastI32x4

func M512MaskzBroadcastI32x4(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_maskz_broadcast_i32x4'. Requires AVX512F.

func M512MaskzBroadcastI64x4

func M512MaskzBroadcastI64x4(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_maskz_broadcast_i64x4'. Requires AVX512F.

func M512MaskzBroadcastdEpi32

func M512MaskzBroadcastdEpi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_broadcastd_epi32'. Requires AVX512F.

func M512MaskzBroadcastqEpi64

func M512MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_broadcastq_epi64'. Requires AVX512F.

func M512MaskzBroadcastsdPd

func M512MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_maskz_broadcastsd_pd'. Requires AVX512F.

func M512MaskzBroadcastssPs

func M512MaskzBroadcastssPs(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_maskz_broadcastss_ps'. Requires AVX512F.

func M512MaskzCompressEpi32

func M512MaskzCompressEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_maskz_compress_epi32'. Requires AVX512F.

func M512MaskzCompressEpi64

func M512MaskzCompressEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_maskz_compress_epi64'. Requires AVX512F.
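
The compress family packs the selected elements densely at the bottom of the result. A pure-Go sketch of the zero-filling 64-bit variant (names hypothetical):

	package main

	import "fmt"

	// maskzCompress models VPCOMPRESSQ with a zeromask: active elements
	// are stored contiguously from position 0 and the tail is zeroed.
	func maskzCompress(k uint8, a []int64) []int64 {
		dst := make([]int64, len(a)) // zero-initialized tail
		m := 0
		for j := range a {
			if (k>>uint(j))&1 == 1 {
				dst[m] = a[j]
				m++
			}
		}
		return dst
	}

	func main() {
		a := []int64{10, 11, 12, 13, 14, 15, 16, 17}
		fmt.Println(maskzCompress(0b10100110, a)) // [11 12 15 17 0 0 0 0]
	}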

func M512MaskzCompressPd

func M512MaskzCompressPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_maskz_compress_pd'. Requires AVX512F.

func M512MaskzCompressPs

func M512MaskzCompressPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_maskz_compress_ps'. Requires AVX512F.

func M512MaskzCvtRoundepi32Ps

func M512MaskzCvtRoundepi32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskzCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepi32_ps'. Requires AVX512F.
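
For reference, the rounding-control names listed above combine by bitwise OR. The bit values below are the ones defined in Intel's <immintrin.h>; this demonstration package does not export them, so a caller would have to restate them:

	package main

	import "fmt"

	// Rounding-control bits as defined in Intel's <immintrin.h>.
	const (
		froundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		froundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		froundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		froundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		froundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
		froundNoExc        = 0x08 // _MM_FROUND_NO_EXC
	)

	func main() {
		// "truncate, and suppress exceptions" from the list above:
		fmt.Printf("%#x\n", froundToZero|froundNoExc) // 0xb
	}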

func M512MaskzCvtRoundepu32Ps

func M512MaskzCvtRoundepu32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskzCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_UnsignedInt32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepu32_ps'. Requires AVX512F.

func M512MaskzCvtRoundpdEpi32

func M512MaskzCvtRoundpdEpi32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			l := 64*j
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epi32'. Requires AVX512F.

func M512MaskzCvtRoundpdEpu32

func M512MaskzCvtRoundpdEpu32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			l := 64*j
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epu32'. Requires AVX512F.

func M512MaskzCvtRoundpdPs

func M512MaskzCvtRoundpdPs(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)

M512MaskzCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvt_roundpd_ps'. Requires AVX512F.

func M512MaskzCvtRoundphPs

func M512MaskzCvtRoundphPs(k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)

M512MaskzCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		IF k[j]
			dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvt_roundph_ps'. Requires AVX512F.

func M512MaskzCvtRoundpsEpi32

func M512MaskzCvtRoundpsEpi32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epi32'. Requires AVX512F.

func M512MaskzCvtRoundpsEpu32

func M512MaskzCvtRoundpsEpu32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epu32'. Requires AVX512F.

func M512MaskzCvtRoundpsPd

func M512MaskzCvtRoundpsPd(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)

M512MaskzCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		l := 32*j
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvt_roundps_pd'. Requires AVX512F.

func M512MaskzCvtRoundpsPh

func M512MaskzCvtRoundpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvt_roundps_ph'. Requires AVX512F.

func M512MaskzCvtepi16Epi32

func M512MaskzCvtepi16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_maskz_cvtepi16_epi32'. Requires AVX512F.
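
The SignExtend step corresponds directly to Go's signed widening conversion; the ZeroExtend used by the VPMOVZX* variants further down widens through the unsigned type instead. A two-line contrast:

	package main

	import "fmt"

	func main() {
		w := int16(-2) // bit pattern 0xFFFE
		fmt.Println(int32(w))         // SignExtend: -2 (0xFFFFFFFE)
		fmt.Println(int32(uint16(w))) // ZeroExtend: 65534 (0x0000FFFE)
	}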

func M512MaskzCvtepi16Epi64

func M512MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_maskz_cvtepi16_epi64'. Requires AVX512F.

func M512MaskzCvtepi32Epi16

func M512MaskzCvtepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_maskz_cvtepi32_epi16'. Requires AVX512F.

func M512MaskzCvtepi32Epi64

func M512MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_maskz_cvtepi32_epi64'. Requires AVX512F.

func M512MaskzCvtepi32Epi8

func M512MaskzCvtepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_maskz_cvtepi32_epi8'. Requires AVX512F.

func M512MaskzCvtepi32Pd

func M512MaskzCvtepi32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_maskz_cvtepi32_pd'. Requires AVX512F.

func M512MaskzCvtepi32Ps

func M512MaskzCvtepi32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvtepi32_ps'. Requires AVX512F.

func M512MaskzCvtepi64Epi16

func M512MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_maskz_cvtepi64_epi16'. Requires AVX512F.

func M512MaskzCvtepi64Epi32

func M512MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_maskz_cvtepi64_epi32'. Requires AVX512F.

func M512MaskzCvtepi64Epi8

func M512MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_maskz_cvtepi64_epi8'. Requires AVX512F.

func M512MaskzCvtepi8Epi32

func M512MaskzCvtepi8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_maskz_cvtepi8_epi32'. Requires AVX512F.

func M512MaskzCvtepi8Epi64

func M512MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_maskz_cvtepi8_epi64'. Requires AVX512F.

func M512MaskzCvtepu16Epi32

func M512MaskzCvtepu16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_maskz_cvtepu16_epi32'. Requires AVX512F.

func M512MaskzCvtepu16Epi64

func M512MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_maskz_cvtepu16_epi64'. Requires AVX512F.

func M512MaskzCvtepu32Epi64

func M512MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_maskz_cvtepu32_epi64'. Requires AVX512F.

func M512MaskzCvtepu32Pd

func M512MaskzCvtepu32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_maskz_cvtepu32_pd'. Requires AVX512F.

func M512MaskzCvtepu32Ps

func M512MaskzCvtepu32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskzCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_UnsignedInt32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvtepu32_ps'. Requires AVX512F.

func M512MaskzCvtepu8Epi32

func M512MaskzCvtepu8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_maskz_cvtepu8_epi32'. Requires AVX512F.

func M512MaskzCvtepu8Epi64

func M512MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_maskz_cvtepu8_epi64'. Requires AVX512F.

func M512MaskzCvtpdEpi32

func M512MaskzCvtpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvtpd_epi32'. Requires AVX512F.

func M512MaskzCvtpdEpu32

func M512MaskzCvtpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtpd_epu32'. Requires AVX512F.

func M512MaskzCvtpdPs

func M512MaskzCvtpdPs(k x86.Mmask8, a x86.M512d) (dst x86.M256)

M512MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvtpd_ps'. Requires AVX512F.

func M512MaskzCvtphPs

func M512MaskzCvtphPs(k x86.Mmask16, a x86.M256i) (dst x86.M512)

M512MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvtph_ps'. Requires AVX512F.
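
The Convert_FP16_To_FP32 step can be reproduced in plain Go by re-biasing the exponent (127-15 = 112) and widening the mantissa. A minimal sketch covering zeros, subnormals, and Inf/NaN (illustrative only, not part of the package):

	package main

	import (
		"fmt"
		"math"
	)

	// fp16ToFP32 converts one IEEE 754 half-precision bit pattern to float32.
	func fp16ToFP32(h uint16) float32 {
		sign := uint32(h>>15) << 31
		exp := int32(h>>10) & 0x1F
		mant := uint32(h) & 0x3FF
		switch {
		case exp == 0x1F: // Inf or NaN: exponent becomes all-ones
			return math.Float32frombits(sign | 0xFF<<23 | mant<<13)
		case exp != 0: // normal: re-bias the exponent
			return math.Float32frombits(sign | uint32(exp+112)<<23 | mant<<13)
		case mant == 0: // signed zero
			return math.Float32frombits(sign)
		default: // subnormal: renormalize into a float32 normal
			e := int32(1)
			for mant&0x400 == 0 {
				mant <<= 1
				e--
			}
			mant &= 0x3FF
			return math.Float32frombits(sign | uint32(e+112)<<23 | mant<<13)
		}
	}

	func main() {
		fmt.Println(fp16ToFP32(0x3C00)) // 1
		fmt.Println(fp16ToFP32(0xC000)) // -2
		fmt.Println(fp16ToFP32(0x0001)) // 5.9604645e-08 (smallest subnormal)
	}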

func M512MaskzCvtpsEpi32

func M512MaskzCvtpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvtps_epi32'. Requires AVX512F.

func M512MaskzCvtpsEpu32

func M512MaskzCvtpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtps_epu32'. Requires AVX512F.

func M512MaskzCvtpsPd

func M512MaskzCvtpsPd(k x86.Mmask8, a x86.M256) (dst x86.M512d)

M512MaskzCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvtps_pd'. Requires AVX512F.

func M512MaskzCvtpsPh

func M512MaskzCvtpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvtps_ph'. Requires AVX512F.

func M512MaskzCvtsepi32Epi16

func M512MaskzCvtsepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_maskz_cvtsepi32_epi16'. Requires AVX512F.
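
The saturating narrowings here differ from the truncating VPMOV* forms earlier: out-of-range values clamp to the extremes of the target type rather than wrapping. A one-lane sketch of Saturate_Int32_To_Int16 in plain Go:

	package main

	import (
		"fmt"
		"math"
	)

	// saturateInt32ToInt16 clamps to the int16 range instead of truncating.
	func saturateInt32ToInt16(x int32) int16 {
		if x > math.MaxInt16 {
			return math.MaxInt16
		}
		if x < math.MinInt16 {
			return math.MinInt16
		}
		return int16(x)
	}

	func main() {
		v := int32(70000)
		fmt.Println(saturateInt32ToInt16(v)) // 32767
		fmt.Println(int16(v))                // 4464: truncation wraps instead
	}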

func M512MaskzCvtsepi32Epi8

func M512MaskzCvtsepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_maskz_cvtsepi32_epi8'. Requires AVX512F.

func M512MaskzCvtsepi64Epi16

func M512MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_maskz_cvtsepi64_epi16'. Requires AVX512F.

func M512MaskzCvtsepi64Epi32

func M512MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_maskz_cvtsepi64_epi32'. Requires AVX512F.

func M512MaskzCvtsepi64Epi8

func M512MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_maskz_cvtsepi64_epi8'. Requires AVX512F.

func M512MaskzCvttRoundpdEpi32

func M512MaskzCvttRoundpdEpi32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskzCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epi32'. Requires AVX512F.

func M512MaskzCvttRoundpdEpu32

func M512MaskzCvttRoundpdEpu32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskzCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epu32'. Requires AVX512F.

func M512MaskzCvttRoundpsEpi32

func M512MaskzCvttRoundpsEpi32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epi32'. Requires AVX512F.

func M512MaskzCvttRoundpsEpu32

func M512MaskzCvttRoundpsEpu32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epu32'. Requires AVX512F.

func M512MaskzCvttpdEpi32

func M512MaskzCvttpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvttpd_epi32'. Requires AVX512F.

func M512MaskzCvttpdEpu32

func M512MaskzCvttpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvttpd_epu32'. Requires AVX512F.

func M512MaskzCvttpsEpi32

func M512MaskzCvttpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvttps_epi32'. Requires AVX512F.
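
Go's own float-to-integer conversion already truncates toward zero, matching Convert_FP32_To_Int32_Truncate for in-range inputs. Out-of-range inputs differ: the instruction returns the integer-indefinite value 0x80000000, while Go's result is implementation-defined, so portable code should range-check first:

	package main

	import "fmt"

	func main() {
		x, y := 2.9, -2.9
		fmt.Println(int32(x), int32(y)) // 2 -2: truncation toward zero
	}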

func M512MaskzCvttpsEpu32

func M512MaskzCvttpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvttps_epu32'. Requires AVX512F.

func M512MaskzCvtusepi32Epi16

func M512MaskzCvtusepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_maskz_cvtusepi32_epi16'. Requires AVX512F.

func M512MaskzCvtusepi32Epi8

func M512MaskzCvtusepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_maskz_cvtusepi32_epi8'. Requires AVX512F.
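
Unsigned saturation clamps at the top of the unsigned target range. A one-lane sketch of Saturate_UnsignedInt32_To_Int8, with wrapping truncation shown for contrast:

	package main

	import "fmt"

	// saturateUint32ToUint8 clamps values above 255 to 255.
	func saturateUint32ToUint8(x uint32) uint8 {
		if x > 0xFF {
			return 0xFF
		}
		return uint8(x)
	}

	func main() {
		x := uint32(300)
		fmt.Println(saturateUint32ToUint8(x)) // 255
		fmt.Println(uint8(x))                 // 44: plain truncation wraps
	}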

func M512MaskzCvtusepi64Epi16

func M512MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_maskz_cvtusepi64_epi16'. Requires AVX512F.

func M512MaskzCvtusepi64Epi32

func M512MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_maskz_cvtusepi64_epi32'. Requires AVX512F.

func M512MaskzCvtusepi64Epi8

func M512MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_maskz_cvtusepi64_epi8'. Requires AVX512F.

func M512MaskzDivPd

func M512MaskzDivPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_pd'. Requires AVX512F.

func M512MaskzDivPs

func M512MaskzDivPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_ps'. Requires AVX512F.

func M512MaskzDivRoundPd

func M512MaskzDivRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			IF k[j]
				dst[i+63:i] := a[i+63:i] / b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_round_pd'. Requires AVX512F.

func M512MaskzDivRoundPs

func M512MaskzDivRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := a[i+31:i] / b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_round_ps'. Requires AVX512F.

func M512MaskzExpandEpi32

func M512MaskzExpandEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_maskz_expand_epi32'. Requires AVX512F.
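
Expand is the inverse of the compress operation shown earlier: consecutive source elements scatter out to the positions selected by 'k'. A pure-Go sketch (illustrative names):

	package main

	import "fmt"

	// maskzExpand32 places consecutive elements of 'a' at the positions
	// whose mask bit is set; unselected positions remain zero.
	func maskzExpand32(k uint16, a [16]int32) [16]int32 {
		var dst [16]int32
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return dst
	}

	func main() {
		a := [16]int32{1, 2, 3, 4}
		fmt.Println(maskzExpand32(0x0303, a)) // [1 2 0 0 0 0 0 0 3 4 0 ...]
	}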

func M512MaskzExpandEpi64

func M512MaskzExpandEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_maskz_expand_epi64'. Requires AVX512F.

func M512MaskzExpandPd

func M512MaskzExpandPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_maskz_expand_pd'. Requires AVX512F.

func M512MaskzExpandPs

func M512MaskzExpandPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_maskz_expand_ps'. Requires AVX512F.

func M512MaskzExtractf32x4Ps

func M512MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)

M512MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_maskz_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
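
Lane extraction amounts to copying one aligned 128-bit chunk of the source, selected by the low bits of 'imm8'. A sketch on a 16-element array, with the per-element masking omitted:

	package main

	import "fmt"

	// extract32x4 selects one 128-bit lane (four float32s) by index,
	// as the imm8 CASE above does.
	func extract32x4(a [16]float32, imm8 byte) [4]float32 {
		var dst [4]float32
		copy(dst[:], a[4*(imm8&3):]) // imm8 & 3 selects lane 0..3
		return dst
	}

	func main() {
		var a [16]float32
		for i := range a {
			a[i] = float32(i)
		}
		fmt.Println(extract32x4(a, 2)) // [8 9 10 11]
	}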

func M512MaskzExtractf64x4Pd

func M512MaskzExtractf64x4Pd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)

M512MaskzExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_maskz_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti32x4Epi32

func M512MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_maskz_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti64x4Epi64

func M512MaskzExtracti64x4Epi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskzExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_maskz_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmPd

func M512MaskzFixupimmPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
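
The heart of the fixup is the token_response[3:0] := src3[3+4*j:4*j] step: the classified input selects one 4-bit action code from the per-element control word in 'c'. A sketch of that nibble lookup, with a hypothetical control word:

	package main

	import "fmt"

	// tokenResponse extracts the 4-bit action code for input class j (0..7)
	// from the control word, i.e. src3[3+4*j : 4*j] in the pseudocode.
	func tokenResponse(ctrl uint64, j uint) uint8 {
		return uint8(ctrl>>(4*j)) & 0xF
	}

	func main() {
		// Hypothetical control word: class 2 (ZERO_VALUE_TOKEN) -> code 8
		// (+0), class 4 (NEG_INF_TOKEN) -> code 14 (MAX_FLOAT).
		ctrl := uint64(0x8)<<(4*2) | uint64(0xE)<<(4*4)
		fmt.Println(tokenResponse(ctrl, 2), tokenResponse(ctrl, 4)) // 8 14
	}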

func M512MaskzFixupimmPs

func M512MaskzFixupimmPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmRoundPd

func M512MaskzFixupimmRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmRoundPs

func M512MaskzFixupimmRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512MaskzFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFmaddPd

func M512MaskzFmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_pd'. Requires AVX512F.
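
The fused multiply-add computes a*b + c with a single rounding per element, which Go exposes directly as math.FMA. The residual of an inexact product makes the single rounding visible:

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		x := 1.0 / 3.0
		fmt.Println(x*3 - 1)            // 0: x*3 rounds to exactly 1 first
		fmt.Println(math.FMA(x, 3, -1)) // -5.551115123125783e-17: true residual
	}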

func M512MaskzFmaddPs

func M512MaskzFmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_ps'. Requires AVX512F.

func M512MaskzFmaddRoundPd

func M512MaskzFmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_round_pd'. Requires AVX512F.

func M512MaskzFmaddRoundPs

func M512MaskzFmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_round_ps'. Requires AVX512F.

func M512MaskzFmaddsubPd

func M512MaskzFmaddsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_pd'. Requires AVX512F.
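
The only twist over a plain FMA is the per-lane alternation, which a scalar sketch makes explicit (even lanes subtract 'c', odd lanes add it). Names and types below are illustrative, not part of this package.

	package sketch

	import "math"

	// maskzFmaddsubPd mirrors the pseudocode above: even j computes
	// a*b - c, odd j computes a*b + c, and masked-off lanes stay zero.
	func maskzFmaddsubPd(k uint8, a, b, c [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			if j%2 == 0 {
				dst[j] = math.FMA(a[j], b[j], -c[j])
			} else {
				dst[j] = math.FMA(a[j], b[j], c[j])
			}
		}
		return
	}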

func M512MaskzFmaddsubPs

func M512MaskzFmaddsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_ps'. Requires AVX512F.

func M512MaskzFmaddsubRoundPd

func M512MaskzFmaddsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_round_pd'. Requires AVX512F.

func M512MaskzFmaddsubRoundPs

func M512MaskzFmaddsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_round_ps'. Requires AVX512F.

func M512MaskzFmsubPd

func M512MaskzFmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_pd'. Requires AVX512F.

func M512MaskzFmsubPs

func M512MaskzFmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_ps'. Requires AVX512F.

func M512MaskzFmsubRoundPd

func M512MaskzFmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_round_pd'. Requires AVX512F.

func M512MaskzFmsubRoundPs

func M512MaskzFmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_round_ps'. Requires AVX512F.

func M512MaskzFmsubaddPd

func M512MaskzFmsubaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_pd'. Requires AVX512F.

func M512MaskzFmsubaddPs

func M512MaskzFmsubaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_ps'. Requires AVX512F.

func M512MaskzFmsubaddRoundPd

func M512MaskzFmsubaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_round_pd'. Requires AVX512F.

func M512MaskzFmsubaddRoundPs

func M512MaskzFmsubaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskzFnmaddPd

func M512MaskzFnmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_pd'. Requires AVX512F.

func M512MaskzFnmaddPs

func M512MaskzFnmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_ps'. Requires AVX512F.

func M512MaskzFnmaddRoundPd

func M512MaskzFnmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_round_pd'. Requires AVX512F.

func M512MaskzFnmaddRoundPs

func M512MaskzFnmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_round_ps'. Requires AVX512F.

func M512MaskzFnmsubPd

func M512MaskzFnmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_pd'. Requires AVX512F.

func M512MaskzFnmsubPs

func M512MaskzFnmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_ps'. Requires AVX512F.

func M512MaskzFnmsubRoundPd

func M512MaskzFnmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_round_pd'. Requires AVX512F.

func M512MaskzFnmsubRoundPs

func M512MaskzFnmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_round_ps'. Requires AVX512F.

func M512MaskzGetexpPd

func M512MaskzGetexpPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_pd'. Requires AVX512F.
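
For a single lane, Go's standard library already computes the same quantity: math.Logb returns the binary exponent floor(log2(|x|)) as a float64. The sketch below is illustrative only and ignores the denormal and NaN edge cases that VGETEXPPD defines precisely.

	package sketch

	import "math"

	// getexpLane models one element of VGETEXPPD.
	func getexpLane(x float64) float64 {
		return math.Logb(x) // floor(log2(|x|)); -Inf for x == 0
	}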

func M512MaskzGetexpPs

func M512MaskzGetexpPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_ps'. Requires AVX512F.

func M512MaskzGetexpRoundPd

func M512MaskzGetexpRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskzGetexpRoundPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertExpFP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_round_pd'. Requires AVX512F.

func M512MaskzGetexpRoundPs

func M512MaskzGetexpRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskzGetexpRoundPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ConvertExpFP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_round_ps'. Requires AVX512F.

func M512MaskzGetmantPd

func M512MaskzGetmantPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)

M512MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_pd'. Requires AVX512F.
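
For the common _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src case, math.Frexp gives a quick scalar model: it returns frac with |frac| in [0.5, 1), so doubling it lands the mantissa in [1, 2) with the source sign. A sketch under that assumption only; the other intervals, sign modes, and special inputs are not handled.

	package sketch

	import "math"

	// getmant12 models one lane of VGETMANTPD with interv =
	// _MM_MANT_NORM_1_2 and sc = _MM_MANT_SIGN_src.
	func getmant12(x float64) float64 {
		frac, _ := math.Frexp(x) // x = frac * 2^exp, |frac| in [0.5, 1)
		return frac * 2          // |result| in [1, 2), sign of x kept
	}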

func M512MaskzGetmantPs

func M512MaskzGetmantPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)

M512MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_ps'. Requires AVX512F.

func M512MaskzGetmantRoundPd

func M512MaskzGetmantRoundPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)

M512MaskzGetmantRoundPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_round_pd'. Requires AVX512F.

func M512MaskzGetmantRoundPs

func M512MaskzGetmantRoundPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)

M512MaskzGetmantRoundPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_round_ps'. Requires AVX512F.

func M512MaskzInsertf32x4

func M512MaskzInsertf32x4(k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_maskz_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
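
Ignoring the mask step, the lane insertion itself is just a 128-bit (four-float) copy at the offset named by imm8[1:0], as this hypothetical sketch on plain arrays shows.

	package sketch

	// insertF32x4 copies a, then overwrites the 128-bit lane selected by
	// imm8[1:0] with b — the 'tmp' computation in the pseudocode above.
	func insertF32x4(a [16]float32, b [4]float32, imm8 byte) [16]float32 {
		tmp := a
		lane := int(imm8 & 3)
		copy(tmp[lane*4:lane*4+4], b[:])
		return tmp
	}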

func M512MaskzInsertf64x4

func M512MaskzInsertf64x4(k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512MaskzInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_maskz_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti32x4

func M512MaskzInserti32x4(k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_maskz_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti64x4

func M512MaskzInserti64x4(k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskzInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_maskz_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzMaxEpi32

func M512MaskzMaxEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm512_maskz_max_epi32'. Requires AVX512F.
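
A scalar model of the masked maximum, with plain arrays standing in for x86.M512i (illustrative only):

	package sketch

	// maskzMaxEpi32: per-lane signed maximum where mask bit j is set;
	// masked-off lanes stay zero.
	func maskzMaxEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<j) != 0 {
				if a[j] > b[j] {
					dst[j] = a[j]
				} else {
					dst[j] = b[j]
				}
			}
		}
		return
	}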

func M512MaskzMaxEpi64

func M512MaskzMaxEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_maskz_max_epi64'. Requires AVX512F.

func M512MaskzMaxEpu32

func M512MaskzMaxEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm512_maskz_max_epu32'. Requires AVX512F.

func M512MaskzMaxEpu64

func M512MaskzMaxEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_maskz_max_epu64'. Requires AVX512F.

func M512MaskzMaxPd

func M512MaskzMaxPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_pd'. Requires AVX512F.

func M512MaskzMaxPs

func M512MaskzMaxPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_ps'. Requires AVX512F.

func M512MaskzMaxRoundPd

func M512MaskzMaxRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskzMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_round_pd'. Requires AVX512F.

func M512MaskzMaxRoundPs

func M512MaskzMaxRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskzMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_round_ps'. Requires AVX512F.

func M512MaskzMinEpi32

func M512MaskzMinEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm512_maskz_min_epi32'. Requires AVX512F.

func M512MaskzMinEpi64

func M512MaskzMinEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_maskz_min_epi64'. Requires AVX512F.

func M512MaskzMinEpu32

func M512MaskzMinEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm512_maskz_min_epu32'. Requires AVX512F.

func M512MaskzMinEpu64

func M512MaskzMinEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_maskz_min_epu64'. Requires AVX512F.

func M512MaskzMinPd

func M512MaskzMinPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_pd'. Requires AVX512F.

func M512MaskzMinPs

func M512MaskzMinPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_ps'. Requires AVX512F.

func M512MaskzMinRoundPd

func M512MaskzMinRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskzMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_round_pd'. Requires AVX512F.

func M512MaskzMinRoundPs

func M512MaskzMinRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskzMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_round_ps'. Requires AVX512F.

func M512MaskzMovEpi32

func M512MaskzMovEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm512_maskz_mov_epi32'. Requires AVX512F.

func M512MaskzMovEpi64

func M512MaskzMovEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm512_maskz_mov_epi64'. Requires AVX512F.

func M512MaskzMovPd

func M512MaskzMovPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm512_maskz_mov_pd'. Requires AVX512F.

func M512MaskzMovPs

func M512MaskzMovPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm512_maskz_mov_ps'. Requires AVX512F.

func M512MaskzMovedupPd

func M512MaskzMovedupPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_maskz_movedup_pd'. Requires AVX512F.
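
The duplication pattern is easier to see on plain arrays: each even-indexed element is copied into the odd slot above it before the zeromask is applied. A hypothetical sketch:

	package sketch

	// maskzMovedupPd mirrors the 'tmp' table above: tmp[2p] = tmp[2p+1]
	// = a[2p], then the zeromask selects or zeroes each lane.
	func maskzMovedupPd(k uint8, a [8]float64) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j += 2 {
			tmp[j], tmp[j+1] = a[j], a[j]
		}
		for j := 0; j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}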

func M512MaskzMovehdupPs

func M512MaskzMovehdupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_maskz_movehdup_ps'. Requires AVX512F.

func M512MaskzMoveldupPs

func M512MaskzMoveldupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_maskz_moveldup_ps'. Requires AVX512F.

func M512MaskzMulEpi32

func M512MaskzMulEpi32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_maskz_mul_epi32'. Requires AVX512F.
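
The point of VPMULDQ is the widening: only the low signed 32 bits of each 64-bit lane are multiplied, into a full 64-bit product. In scalar Go that is a truncate-then-sign-extend, as this illustrative sketch shows.

	package sketch

	// maskzMulEpi32: int32(v) keeps the low 32 bits of each lane and the
	// int64 conversion sign-extends them, so the product never overflows.
	func maskzMulEpi32(k uint8, a, b [8]int64) (dst [8]int64) {
		for j := 0; j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
			}
		}
		return
	}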

func M512MaskzMulEpu32

func M512MaskzMulEpu32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_maskz_mul_epu32'. Requires AVX512F.

func M512MaskzMulPd

func M512MaskzMulPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_pd'. Requires AVX512F.

func M512MaskzMulPs

func M512MaskzMulPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_ps'. Requires AVX512F.

func M512MaskzMulRoundPd

func M512MaskzMulRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzMulRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] * b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_round_pd'. Requires AVX512F.

func M512MaskzMulRoundPs

func M512MaskzMulRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzMulRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] * b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_round_ps'. Requires AVX512F.

func M512MaskzMulloEpi32

func M512MaskzMulloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm512_maskz_mullo_epi32'. Requires AVX512F.
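
Keeping only the low 32 bits of the product is exactly what Go's int32 multiplication does, since integer overflow wraps. A hypothetical scalar model:

	package sketch

	// maskzMulloEpi32: a[j] * b[j] wraps modulo 2^32, i.e. it keeps the
	// low 32 bits of the 64-bit intermediate, matching VPMULLD.
	func maskzMulloEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] * b[j]
			}
		}
		return
	}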

func M512MaskzOrEpi32

func M512MaskzOrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORD'. Intrinsic: '_mm512_maskz_or_epi32'. Requires AVX512F.

func M512MaskzOrEpi64

func M512MaskzOrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm512_maskz_or_epi64'. Requires AVX512F.

func M512MaskzPermutePd

func M512MaskzPermutePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
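
The sixteen IF lines above reduce to one control bit per destination element, selecting the low or high double of its own 128-bit lane. A plain-Go sketch (maskzPermutePd is an illustrative name, not a package function):

	func maskzPermutePd(k uint8, a [8]float64, imm8 byte) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 2) * 2           // base index of this 128-bit lane
			sel := int(imm8>>uint(j)) & 1 // control bit j picks low or high
			dst[j] = a[lane+sel]
		}
		return
	}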

func M512MaskzPermutePs

func M512MaskzPermutePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutevarPd

func M512MaskzPermutevarPd(k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)

M512MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permutevar_pd'. Requires AVX512F.

func M512MaskzPermutevarPs

func M512MaskzPermutevarPs(k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)

M512MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permutevar_ps'. Requires AVX512F.
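
Only bits [1:0] of each control element matter, and every selection stays within the element's own 128-bit lane. A plain-Go sketch of the pseudocode (maskzPermutevarPs is an illustrative name, not a package function):

	func maskzPermutevarPs(k uint16, a [16]float32, b [16]uint32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 4) * 4  // base index of this 128-bit lane
			sel := int(b[j] & 3) // SELECT4 control
			dst[j] = a[lane+sel]
		}
		return
	}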

func M512MaskzPermutex2varEpi32

func M512MaskzPermutex2varEpi32(k x86.Mmask16, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_maskz_permutex2var_epi32'. Requires AVX512F.
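
Each idx element is a 5-bit selector over the 32 elements of the concatenated pair: bits [3:0] pick the element and bit 4 picks between 'a' and 'b'. A plain-Go sketch (maskzPermutex2varEpi32 is an illustrative name, not a package function):

	func maskzPermutex2varEpi32(k uint16, a, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			off := idx[j] & 0xF   // element index within the chosen source
			if idx[j]&0x10 != 0 { // idx bit 4 selects the source table
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}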

func M512MaskzPermutex2varEpi64

func M512MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_maskz_permutex2var_epi64'. Requires AVX512F.

func M512MaskzPermutex2varPd

func M512MaskzPermutex2varPd(k x86.Mmask8, a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_maskz_permutex2var_pd'. Requires AVX512F.

func M512MaskzPermutex2varPs

func M512MaskzPermutex2varPs(k x86.Mmask16, a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)

M512MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_maskz_permutex2var_ps'. Requires AVX512F.

func M512MaskzPermutexEpi64

func M512MaskzPermutexEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutexPd

func M512MaskzPermutexPd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutexvarEpi32

func M512MaskzPermutexvarEpi32(k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_maskz_permutexvar_epi32'. Requires AVX512F.

func M512MaskzPermutexvarEpi64

func M512MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutexvar_epi64'. Requires AVX512F.

func M512MaskzPermutexvarPd

func M512MaskzPermutexvarPd(k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutexvar_pd'. Requires AVX512F.

func M512MaskzPermutexvarPs

func M512MaskzPermutexvarPs(k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)

M512MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_maskz_permutexvar_ps'. Requires AVX512F.

func M512MaskzRcp14Pd

func M512MaskzRcp14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_maskz_rcp14_pd'. Requires AVX512F.

func M512MaskzRcp14Ps

func M512MaskzRcp14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_maskz_rcp14_ps'. Requires AVX512F.
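
The hardware result is only guaranteed to agree with the true reciprocal to a relative error below 2^-14, so exact division can stand in for APPROXIMATE in a reference model (an assumption of this sketch, not bit-exact hardware behavior):

	func maskzRcp14Ps(k uint16, a [16]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = 1.0 / a[j] // exact stand-in for the 2^-14 approximation
			}
		}
		return
	}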

func M512MaskzRolEpi32

func M512MaskzRolEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
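
LEFT_ROTATE_DWORDS is exactly math/bits.RotateLeft32 with the count reduced modulo 32. A plain-Go sketch (maskzRolEpi32 is an illustrative name, not a package function):

	import "math/bits"

	func maskzRolEpi32(k uint16, a [16]uint32, imm8 byte) (dst [16]uint32) {
		n := int(imm8) % 32 // count := count_src modulo 32
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], n)
			}
		}
		return
	}

For example, rotating 0x80000001 left by 4 gives 0x00000018: both set bits move up four positions, with bit 31 wrapping around to bit 3.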

func M512MaskzRolEpi64

func M512MaskzRolEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRolvEpi32

func M512MaskzRolvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_maskz_rolv_epi32'. Requires AVX512F.

func M512MaskzRolvEpi64

func M512MaskzRolvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_maskz_rolv_epi64'. Requires AVX512F.

func M512MaskzRorEpi32

func M512MaskzRorEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRorEpi64

func M512MaskzRorEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRorvEpi32

func M512MaskzRorvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_maskz_rorv_epi32'. Requires AVX512F.

func M512MaskzRorvEpi64

func M512MaskzRorvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_maskz_rorv_epi64'. Requires AVX512F.

func M512MaskzRoundscalePd

func M512MaskzRoundscalePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
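
With imm8[2] = 0 and rounding direction 0 (round to nearest even), a single lane reduces to round(2^M * x) / 2^M. A per-lane sketch under those assumptions (roundscalePdLane is an illustrative name, not a package function):

	import "math"

	func roundscalePdLane(x float64, m uint) float64 {
		scale := math.Ldexp(1, int(m))           // 2^M
		return math.RoundToEven(x*scale) / scale // round, then scale back down
	}

For example, roundscalePdLane(1.2345, 4) rounds to the nearest multiple of 2^-4 and returns 1.25.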

func M512MaskzRoundscalePs

func M512MaskzRoundscalePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRoundscaleRoundPd

func M512MaskzRoundscaleRoundPd(k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRoundscaleRoundPs

func M512MaskzRoundscaleRoundPs(k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRsqrt14Pd

func M512MaskzRsqrt14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_maskz_rsqrt14_pd'. Requires AVX512F.

func M512MaskzRsqrt14Ps

func M512MaskzRsqrt14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_maskz_rsqrt14_ps'. Requires AVX512F.

func M512MaskzScalefPd

func M512MaskzScalefPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_pd'. Requires AVX512F.
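
On the ordinary path (no NaNs, no suppressed denormals) SCALE reduces to src1 * 2^FLOOR(src2). A per-lane sketch that deliberately skips the special cases above (scalefPdLane is an illustrative name, not a package function):

	import "math"

	func scalefPdLane(a, b float64) float64 {
		// math.Ldexp multiplies by 2^exp exactly, matching POW(2, FLOOR(b)).
		return math.Ldexp(a, int(math.Floor(b)))
	}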

func M512MaskzScalefPs

func M512MaskzScalefPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_ps'. Requires AVX512F.

func M512MaskzScalefRoundPd

func M512MaskzScalefRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_round_pd'. Requires AVX512F.

func M512MaskzScalefRoundPs

func M512MaskzScalefRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_round_ps'. Requires AVX512F.

func M512MaskzSet1Epi32

func M512MaskzSet1Epi32(k x86.Mmask16, a int) (dst x86.M512i)

M512MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_set1_epi32'. Requires AVX512F.
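
Zeromask broadcasting in plain Go is just a guarded fill (maskzSet1Epi32 is an illustrative name, not a package function):

	func maskzSet1Epi32(k uint16, a int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a // broadcast into selected lanes only
			}
		}
		return
	}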

func M512MaskzSet1Epi64

func M512MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M512i)

M512MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_set1_epi64'. Requires AVX512F.

func M512MaskzShuffleEpi32

func M512MaskzShuffleEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm512_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
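
The same eight imm8 bits drive every 128-bit lane, so one lane's shuffle pattern repeats four times. A plain-Go sketch (maskzShuffleEpi32 is an illustrative name, not a package function):

	func maskzShuffleEpi32(k uint16, a [16]uint32, imm8 byte) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 4) * 4                 // base index of this 128-bit lane
			sel := int(imm8>>(uint(j%4)*2)) & 3 // SELECT4 control for slot j%4
			dst[j] = a[lane+sel]
		}
		return
	}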

func M512MaskzShuffleF32x4

func M512MaskzShuffleF32x4(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_maskz_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
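
The shuffle moves whole 128-bit blocks: the two low destination blocks come from 'a' and the two high ones from 'b', each chosen by two imm8 bits. A plain-Go sketch (maskzShuffleF32x4 is an illustrative name, not a package function):

	func maskzShuffleF32x4(k uint16, a, b [16]float32, imm8 byte) (dst [16]float32) {
		var tmp [16]float32
		for blk := 0; blk < 4; blk++ {
			src := a
			if blk >= 2 {
				src = b // upper two destination blocks select from b
			}
			sel := int(imm8>>(uint(blk)*2)) & 3 // which 128-bit source block
			copy(tmp[blk*4:blk*4+4], src[sel*4:sel*4+4])
		}
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}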

func M512MaskzShuffleF64x2

func M512MaskzShuffleF64x2(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_maskz_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShuffleI32x4

func M512MaskzShuffleI32x4(k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_maskz_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShuffleI64x2

func M512MaskzShuffleI64x2(k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_maskz_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShufflePd

func M512MaskzShufflePd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShufflePs

func M512MaskzShufflePs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllEpi32

func M512MaskzSllEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_sll_epi32'. Requires AVX512F.
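
All lanes share one count taken from the low 64 bits of the 'count' operand, and any count above 31 clears the lane instead of wrapping. A plain-Go sketch where the uint64 parameter models count[63:0] (maskzSllEpi32 is an illustrative name, not a package function):

	func maskzSllEpi32(k uint16, a [16]uint32, count uint64) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 && count <= 31 {
				// Go's << on uint32 also yields 0 for counts >= 32, so the
				// explicit clamp simply mirrors the pseudocode's check.
				dst[j] = a[j] << count
			} // count > 31 or mask bit clear: lane stays zero
		}
		return
	}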

func M512MaskzSllEpi64

func M512MaskzSllEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_sll_epi64'. Requires AVX512F.

func M512MaskzSlliEpi32

func M512MaskzSlliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSlliEpi64

func M512MaskzSlliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllvEpi32

func M512MaskzSllvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm512_maskz_sllv_epi32'. Requires AVX512F.

func M512MaskzSllvEpi64

func M512MaskzSllvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_maskz_sllv_epi64'. Requires AVX512F.

func M512MaskzSqrtPd

func M512MaskzSqrtPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_pd'. Requires AVX512F.

func M512MaskzSqrtPs

func M512MaskzSqrtPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_ps'. Requires AVX512F.

func M512MaskzSqrtRoundPd

func M512MaskzSqrtRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskzSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SQRT(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_round_pd'. Requires AVX512F.

func M512MaskzSqrtRoundPs

func M512MaskzSqrtRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskzSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SQRT(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_round_ps'. Requires AVX512F.

func M512MaskzSraEpi32

func M512MaskzSraEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_sra_epi32'. Requires AVX512F.

func M512MaskzSraEpi64

func M512MaskzSraEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_sra_epi64'. Requires AVX512F.

func M512MaskzSraiEpi32

func M512MaskzSraiEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSraiEpi64

func M512MaskzSraiEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
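
A count of 63 already smears the sign bit across the whole lane, so clamping the count reproduces the SignBit case above. A plain-Go sketch (maskzSraiEpi64 is an illustrative name, not a package function; Go's >> on signed integers is the arithmetic shift that SignExtend describes):

	func maskzSraiEpi64(k uint8, a [8]int64, imm8 byte) (dst [8]int64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			n := uint(imm8)
			if n > 63 {
				n = 63 // fills the lane with the sign bit: 0 or -1
			}
			dst[j] = a[j] >> n
		}
		return
	}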

func M512MaskzSravEpi32

func M512MaskzSravEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm512_maskz_srav_epi32'. Requires AVX512F.

func M512MaskzSravEpi64

func M512MaskzSravEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_maskz_srav_epi64'. Requires AVX512F.

func M512MaskzSrlEpi32

func M512MaskzSrlEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srl_epi32'. Requires AVX512F.

func M512MaskzSrlEpi64

func M512MaskzSrlEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srl_epi64'. Requires AVX512F.

func M512MaskzSrliEpi32

func M512MaskzSrliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSrliEpi64

func M512MaskzSrliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSrlvEpi32

func M512MaskzSrlvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm512_maskz_srlv_epi32'. Requires AVX512F.

func M512MaskzSrlvEpi64

func M512MaskzSrlvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_maskz_srlv_epi64'. Requires AVX512F.

func M512MaskzSubEpi32

func M512MaskzSubEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm512_maskz_sub_epi32'. Requires AVX512F.

func M512MaskzSubEpi64

func M512MaskzSubEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_maskz_sub_epi64'. Requires AVX512F.

func M512MaskzSubPd

func M512MaskzSubPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_pd'. Requires AVX512F.

func M512MaskzSubPs

func M512MaskzSubPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_ps'. Requires AVX512F.

func M512MaskzSubRoundPd

func M512MaskzSubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzSubRoundPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] - b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_round_pd'. Requires AVX512F.

func M512MaskzSubRoundPs

func M512MaskzSubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzSubRoundPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] - b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_round_ps'. Requires AVX512F.

func M512MaskzTernarylogicEpi32

func M512MaskzTernarylogicEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
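
The per-bit truth-table lookup is easier to see in scalar form. A minimal sketch for a single 32-bit lane, using a hypothetical helper name that is not part of this package:

	func ternaryLogic32(a, b, c uint32, imm8 uint8) (dst uint32) {
		for h := 0; h < 32; h++ {
			idx := ((a>>h)&1)<<2 | ((b>>h)&1)<<1 | ((c >> h) & 1)
			dst |= uint32((imm8>>idx)&1) << h
		}
		return dst
	}

For example, imm8 = 0xE8 selects the three-way majority function, and imm8 = 0x96 a three-way XOR.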

func M512MaskzTernarylogicEpi64

func M512MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzUnpackhiEpi32

func M512MaskzUnpackhiEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi32'. Requires AVX512F.
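
INTERLEAVE_HIGH_DWORDS works independently on each 128-bit lane; in scalar Go it is just a reordering of four-element groups. An illustrative sketch, not the package API:

	func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
		// take the two high dwords of each source, interleaved
		return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
	}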

func M512MaskzUnpackhiEpi64

func M512MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi64'. Requires AVX512F.

func M512MaskzUnpackhiPd

func M512MaskzUnpackhiPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_maskz_unpackhi_pd'. Requires AVX512F.

func M512MaskzUnpackhiPs

func M512MaskzUnpackhiPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_maskz_unpackhi_ps'. Requires AVX512F.

func M512MaskzUnpackloEpi32

func M512MaskzUnpackloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi32'. Requires AVX512F.

func M512MaskzUnpackloEpi64

func M512MaskzUnpackloEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi64'. Requires AVX512F.

func M512MaskzUnpackloPd

func M512MaskzUnpackloPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_maskz_unpacklo_pd'. Requires AVX512F.

func M512MaskzUnpackloPs

func M512MaskzUnpackloPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_maskz_unpacklo_ps'. Requires AVX512F.

func M512MaskzXorEpi32

func M512MaskzXorEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm512_maskz_xor_epi32'. Requires AVX512F.

func M512MaskzXorEpi64

func M512MaskzXorEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_maskz_xor_epi64'. Requires AVX512F.

func M512MaxEpi64

func M512MaxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_max_epi64'. Requires AVX512F.

func M512MaxEpu64

func M512MaxEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_max_epu64'. Requires AVX512F.
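
VPMAXSQ and VPMAXUQ differ only in how the pseudocode's '>' interprets the bit patterns: signed for epi64, unsigned for epu64. A one-lane sketch of each (hypothetical helpers, not the package API):

	func maxEpi64(a, b int64) int64 { // signed compare (VPMAXSQ)
		if a > b {
			return a
		}
		return b
	}

	func maxEpu64(a, b uint64) uint64 { // unsigned compare (VPMAXUQ)
		if a > b {
			return a
		}
		return b
	}

The same bits can order differently: 0xFFFFFFFFFFFFFFFF is -1 (less than 0) when signed but the largest value when unsigned.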

func M512MaxPd

func M512MaxPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_pd'. Requires AVX512F.

func M512MaxPs

func M512MaxPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_ps'. Requires AVX512F.

func M512MaxRoundPd

func M512MaxRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_round_pd'. Requires AVX512F.

func M512MaxRoundPs

func M512MaxRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_round_ps'. Requires AVX512F.

func M512MinEpi64

func M512MinEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_min_epi64'. Requires AVX512F.

func M512MinEpu64

func M512MinEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_min_epu64'. Requires AVX512F.

func M512MinPd

func M512MinPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_min_pd'. Requires AVX512F.

func M512MinPs

func M512MinPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_min_ps'. Requires AVX512F.

func M512MinRoundPd

func M512MinRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_min_round_pd'. Requires AVX512F.

func M512MinRoundPs

func M512MinRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_min_round_ps'. Requires AVX512F.

func M512MovedupPd

func M512MovedupPd(a x86.M512d) (dst x86.M512d)

M512MovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
dst[191:128] := a[191:128]
dst[255:192] := a[191:128]
dst[319:256] := a[319:256]
dst[383:320] := a[319:256]
dst[447:384] := a[447:384]
dst[511:448] := a[447:384]
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_movedup_pd'. Requires AVX512F.
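
In scalar terms each even-indexed element is copied into the odd slot that follows it. A minimal sketch, not the package API:

	func movedupPd(a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j += 2 {
			dst[j] = a[j]   // even element kept in place
			dst[j+1] = a[j] // and duplicated into the odd slot
		}
		return dst
	}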

func M512MovehdupPs

func M512MovehdupPs(a x86.M512) (dst x86.M512)

M512MovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
dst[159:128] := a[191:160]
dst[191:160] := a[191:160]
dst[223:192] := a[255:224]
dst[255:224] := a[255:224]
dst[287:256] := a[319:288]
dst[319:288] := a[319:288]
dst[351:320] := a[383:352]
dst[383:352] := a[383:352]
dst[415:384] := a[447:416]
dst[447:416] := a[447:416]
dst[479:448] := a[511:480]
dst[511:480] := a[511:480]
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_movehdup_ps'. Requires AVX512F.

func M512MoveldupPs

func M512MoveldupPs(a x86.M512) (dst x86.M512)

M512MoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
dst[159:128] := a[159:128]
dst[191:160] := a[159:128]
dst[223:192] := a[223:192]
dst[255:224] := a[223:192]
dst[287:256] := a[287:256]
dst[319:288] := a[287:256]
dst[351:320] := a[351:320]
dst[383:352] := a[351:320]
dst[415:384] := a[415:384]
dst[447:416] := a[415:384]
dst[479:448] := a[479:448]
dst[511:480] := a[479:448]
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_moveldup_ps'. Requires AVX512F.

func M512MulEpi32

func M512MulEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mul_epi32'. Requires AVX512F.
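
Only the low dword of each 64-bit lane participates, and for this signed variant it is sign-extended before the widening multiply. A one-lane sketch (hypothetical helper, not the package API):

	func mulEpi32Lane(a, b uint64) uint64 {
		lo1 := int64(int32(uint32(a))) // sign-extend the low 32 bits
		lo2 := int64(int32(uint32(b)))
		return uint64(lo1 * lo2) // full 64-bit signed product
	}

The unsigned variant below (VPMULUDQ) zero-extends instead: uint64(uint32(a)) * uint64(uint32(b)).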

func M512MulEpu32

func M512MulEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mul_epu32'. Requires AVX512F.

func M512MulloxEpi64

func M512MulloxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloxEpi64: Multiply packed 64-bit integers in 'a' and 'b', and store the low 64 bits of each product in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mullox_epi64'. Requires AVX512F.

func M512NearbyintPd

func M512NearbyintPd(a x86.M512d) (dst x86.M512d)

M512NearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := NearbyInt(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_nearbyint_pd'. Requires AVX512F.

func M512NearbyintPs

func M512NearbyintPs(a x86.M512) (dst x86.M512)

M512NearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := NearbyInt(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_nearbyint_ps'. Requires AVX512F.

func M512PermutePd

func M512PermutePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512PermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
IF (imm8[2] == 0) dst[191:128] := a[191:128]
IF (imm8[2] == 1) dst[191:128] := a[255:192]
IF (imm8[3] == 0) dst[255:192] := a[191:128]
IF (imm8[3] == 1) dst[255:192] := a[255:192]
IF (imm8[4] == 0) dst[319:256] := a[319:256]
IF (imm8[4] == 1) dst[319:256] := a[383:320]
IF (imm8[5] == 0) dst[383:320] := a[319:256]
IF (imm8[5] == 1) dst[383:320] := a[383:320]
IF (imm8[6] == 0) dst[447:384] := a[447:384]
IF (imm8[6] == 1) dst[447:384] := a[511:448]
IF (imm8[7] == 0) dst[511:448] := a[447:384]
IF (imm8[7] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutePs

func M512PermutePs(a x86.M512, imm8 byte) (dst x86.M512)

M512PermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(a[383:256], imm8[5:4])
dst[383:352] := SELECT4(a[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(a[511:384], imm8[5:4])
dst[511:480] := SELECT4(a[511:384], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutevarPd

func M512PermutevarPd(a x86.M512d, b x86.M512i) (dst x86.M512d)

M512PermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.

IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
IF (b[129] == 0) dst[191:128] := a[191:128]
IF (b[129] == 1) dst[191:128] := a[255:192]
IF (b[193] == 0) dst[255:192] := a[191:128]
IF (b[193] == 1) dst[255:192] := a[255:192]
IF (b[257] == 0) dst[319:256] := a[319:256]
IF (b[257] == 1) dst[319:256] := a[383:320]
IF (b[321] == 0) dst[383:320] := a[319:256]
IF (b[321] == 1) dst[383:320] := a[383:320]
IF (b[385] == 0) dst[447:384] := a[447:384]
IF (b[385] == 1) dst[447:384] := a[511:448]
IF (b[449] == 0) dst[511:448] := a[447:384]
IF (b[449] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permutevar_pd'. Requires AVX512F.

func M512PermutevarPs

func M512PermutevarPs(a x86.M512, b x86.M512i) (dst x86.M512)

M512PermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[159:128] := SELECT4(a[255:128], b[129:128])
dst[191:160] := SELECT4(a[255:128], b[161:160])
dst[223:192] := SELECT4(a[255:128], b[193:192])
dst[255:224] := SELECT4(a[255:128], b[225:224])
dst[287:256] := SELECT4(a[383:256], b[257:256])
dst[319:288] := SELECT4(a[383:256], b[289:288])
dst[351:320] := SELECT4(a[383:256], b[321:320])
dst[383:352] := SELECT4(a[383:256], b[353:352])
dst[415:384] := SELECT4(a[511:384], b[385:384])
dst[447:416] := SELECT4(a[511:384], b[417:416])
dst[479:448] := SELECT4(a[511:384], b[449:448])
dst[511:480] := SELECT4(a[511:384], b[481:480])
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permutevar_ps'. Requires AVX512F.

func M512Permutex2varEpi32

func M512Permutex2varEpi32(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_permutex2var_epi32'. Requires AVX512F.
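
Each idx element uses its low four bits as a dword index and bit 4 as a table selector between 'a' and 'b'. A sketch of the whole-vector semantics (hypothetical helper, not the package API):

	func permutex2varEpi32(a, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			off := idx[j] & 0xF   // idx[i+3:i]
			if idx[j]&0x10 != 0 { // idx[i+4] selects the table
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return dst
	}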

func M512Permutex2varEpi64

func M512Permutex2varEpi64(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_permutex2var_epi64'. Requires AVX512F.

func M512Permutex2varPd

func M512Permutex2varPd(a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_permutex2var_pd'. Requires AVX512F.

func M512Permutex2varPs

func M512Permutex2varPs(a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)

M512Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_permutex2var_ps'. Requires AVX512F.

func M512PermutexEpi64

func M512PermutexEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512PermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutexPd

func M512PermutexPd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutexvarEpi32

func M512PermutexvarEpi32(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_permutexvar_epi32'. Requires AVX512F.

func M512PermutexvarEpi64

func M512PermutexvarEpi64(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutexvar_epi64'. Requires AVX512F.

func M512PermutexvarPd

func M512PermutexvarPd(idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutexvar_pd'. Requires AVX512F.

func M512PermutexvarPs

func M512PermutexvarPs(idx x86.M512i, a x86.M512) (dst x86.M512)

M512PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_permutexvar_ps'. Requires AVX512F.

func M512PowPd

func M512PowPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512PowPd: Compute the exponential value of packed double-precision (64-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_pow_pd'. Requires AVX512F.

func M512PowPs

func M512PowPs(a x86.M512, b x86.M512) (dst x86.M512)

M512PowPs: Compute the exponential value of packed single-precision (32-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_pow_ps'. Requires AVX512F.

func M512Rcp14Pd

func M512Rcp14Pd(a x86.M512d) (dst x86.M512d)

M512Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_rcp14_pd'. Requires AVX512F.

func M512Rcp14Ps

func M512Rcp14Ps(a x86.M512) (dst x86.M512)

M512Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_rcp14_ps'. Requires AVX512F.

func M512RecipPd

func M512RecipPd(a x86.M512d) (dst x86.M512d)

M512RecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (1 / a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_recip_pd'. Requires AVX512F.

func M512RecipPs

func M512RecipPs(a x86.M512) (dst x86.M512)

M512RecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (1 / a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_recip_ps'. Requires AVX512F.

func M512RemEpi16

func M512RemEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 16-bit integers in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi16'. Requires AVX512F.

func M512RemEpi32

func M512RemEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi32'. Requires AVX512F.

func M512RemEpi64

func M512RemEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 64-bit integers in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi64'. Requires AVX512F.

func M512RemEpi8

func M512RemEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 8-bit integers in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi8'. Requires AVX512F.

func M512RemEpu16

func M512RemEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 16-bit integers in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu16'. Requires AVX512F.

func M512RemEpu32

func M512RemEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu32'. Requires AVX512F.

func M512RemEpu64

func M512RemEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 64-bit integers in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu64'. Requires AVX512F.

func M512RemEpu8

func M512RemEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 8-bit integers in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu8'. Requires AVX512F.

func M512RintPd

func M512RintPd(a x86.M512d) (dst x86.M512d)

M512RintPd: Rounds the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RoundToNearestEven(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rint_pd'. Requires AVX512F.

func M512RintPs

func M512RintPs(a x86.M512) (dst x86.M512)

M512RintPs: Rounds the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RoundToNearestEven(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rint_ps'. Requires AVX512F.

func M512RolEpi32

func M512RolEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
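
Outside of intrinsics the same per-lane operation is available from the standard library, which also reduces the rotate count modulo 32. A minimal sketch (hypothetical helper, not the package API):

	import "math/bits"

	func rolEpi32(a [16]uint32, imm8 uint8) (dst [16]uint32) {
		for j, v := range a {
			dst[j] = bits.RotateLeft32(v, int(imm8)) // count taken mod 32
		}
		return dst
	}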

func M512RolEpi64

func M512RolEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RolvEpi32

func M512RolvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_rolv_epi32'. Requires AVX512F.

func M512RolvEpi64

func M512RolvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_rolv_epi64'. Requires AVX512F.

func M512RorEpi32

func M512RorEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RorEpi64

func M512RorEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RorvEpi32

func M512RorvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_rorv_epi32'. Requires AVX512F.

func M512RorvEpi64

func M512RorvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_rorv_epi64'. Requires AVX512F.

func M512RoundscalePd

func M512RoundscalePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
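
Ignoring the exception flags, the per-element operation is: scale by 2^M, round to an integer, and scale back, leaving M fraction bits. A one-element sketch with round-to-nearest-even (hypothetical helper, not the package API):

	import "math"

	func roundscalePd(src float64, m uint8) float64 {
		scale := math.Ldexp(1, int(m)) // 2^M
		return math.RoundToEven(src*scale) / scale
	}

For example, roundscalePd(1.23, 4) rounds to the nearest multiple of 1/16 and returns 1.25.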

func M512RoundscalePs

func M512RoundscalePs(a x86.M512, imm8 byte) (dst x86.M512)

M512RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RoundscaleRoundPd

func M512RoundscaleRoundPd(a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512RoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RoundscaleRoundPs

func M512RoundscaleRoundPs(a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512RoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Rsqrt14Pd

func M512Rsqrt14Pd(a x86.M512d) (dst x86.M512d)

M512Rsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_rsqrt14_pd'. Requires AVX512F.

func M512Rsqrt14Ps

func M512Rsqrt14Ps(a x86.M512) (dst x86.M512)

M512Rsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_rsqrt14_ps'. Requires AVX512F.

func M512ScalefPd

func M512ScalefPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_pd'. Requires AVX512F.
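
For finite inputs the operation reduces to dst = a * 2^FLOOR(b); the NaN, infinity, and DAZ handling above is omitted in this sketch (hypothetical helper, not the package API):

	import "math"

	func scalefPd(a, b float64) float64 {
		// finite-input case only; assumes FLOOR(b) fits in an int
		return math.Ldexp(a, int(math.Floor(b))) // a * 2^floor(b)
	}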

func M512ScalefPs

func M512ScalefPs(a x86.M512, b x86.M512) (dst x86.M512)

M512ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_ps'. Requires AVX512F.

func M512ScalefRoundPd

func M512ScalefRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512ScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_round_pd'. Requires AVX512F.

func M512ScalefRoundPs

func M512ScalefRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512ScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_round_ps'. Requires AVX512F.

func M512Set1Epi16

func M512Set1Epi16(a int16) (dst x86.M512i)

M512Set1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_epi16'. Requires AVX512F.

func M512Set1Epi32

func M512Set1Epi32(a int) (dst x86.M512i)

M512Set1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_set1_epi32'. Requires AVX512F.

func M512Set1Epi64

func M512Set1Epi64(a int64) (dst x86.M512i)

M512Set1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_set1_epi64'. Requires AVX512F.

func M512Set1Epi8

func M512Set1Epi8(a byte) (dst x86.M512i)

M512Set1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_epi8'. Requires AVX512F.

func M512Set1Pd

func M512Set1Pd(a float64) (dst x86.M512d)

M512Set1Pd: Broadcast double-precision (64-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_pd'. Requires AVX512F.

func M512Set1Ps

func M512Set1Ps(a float32) (dst x86.M512)

M512Set1Ps: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_ps'. Requires AVX512F.
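
Every Set1 variant follows the same pattern: one scalar replicated into every lane. A scalar sketch of the 32-bit case (illustrative, using a plain Go array in place of x86.M512i):

package sketch

// set1Epi32 sketches _mm512_set1_epi32: replicate one 32-bit
// integer into all 16 lanes of a 512-bit vector.
func set1Epi32(a int32) [16]int32 {
	var dst [16]int32
	for j := range dst {
		dst[j] = a
	}
	return dst
}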

func M512Set4Epi32

func M512Set4Epi32(d int, c int, b int, a int) (dst x86.M512i)

M512Set4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4 element sequence.

dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_epi32'. Requires AVX512F.
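
The Set4 variants tile a four-element sequence across the register, lowest lane first. A sketch, assuming lane 0 is the least-significant 32 bits:

package sketch

// set4Epi32 sketches _mm512_set4_epi32: lanes 0..3 receive
// d, c, b, a, and the pattern repeats up through lane 15.
func set4Epi32(d, c, b, a int32) [16]int32 {
	seq := [4]int32{d, c, b, a}
	var dst [16]int32
	for j := range dst {
		dst[j] = seq[j%4]
	}
	return dst
}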

func M512Set4Epi64

func M512Set4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)

M512Set4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4 element sequence.

dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_epi64'. Requires AVX512F.

func M512Set4Pd

func M512Set4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)

M512Set4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4 element sequence.

dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_pd'. Requires AVX512F.

func M512Set4Ps

func M512Set4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)

M512Set4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4 element sequence.

dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_ps'. Requires AVX512F.

func M512SetEpi32

func M512SetEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)

M512SetEpi32: Set packed 32-bit integers in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_epi32'. Requires AVX512F.

func M512SetEpi64

func M512SetEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)

M512SetEpi64: Set packed 64-bit integers in 'dst' with the supplied values.

dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_epi64'. Requires AVX512F.

func M512SetPd

func M512SetPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)

M512SetPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values.

dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_pd'. Requires AVX512F.

func M512SetPs

func M512SetPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)

M512SetPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_ps'. Requires AVX512F.

func M512Setr4Epi32

func M512Setr4Epi32(d int, c int, b int, a int) (dst x86.M512i)

M512Setr4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4 element sequence in reverse order.

dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_epi32'. Requires AVX512F.

func M512Setr4Epi64

func M512Setr4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)

M512Setr4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4 element sequence in reverse order.

dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_epi64'. Requires AVX512F.

func M512Setr4Pd

func M512Setr4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)

M512Setr4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4 element sequence in reverse order.

dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_pd'. Requires AVX512F.

func M512Setr4Ps

func M512Setr4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)

M512Setr4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4 element sequence in reverse order.

dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_ps'. Requires AVX512F.

func M512SetrEpi32

func M512SetrEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)

M512SetrEpi32: Set packed 32-bit integers in 'dst' with the supplied values in reverse order.

dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_epi32'. Requires AVX512F.
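
The Setr variants are the Set variants with the argument order reversed: the first argument lands in lane 0 rather than the last. A sketch for the 64-bit case:

package sketch

// setEpi64 places its last argument in lane 0; setrEpi64 places
// its first argument there, so setrEpi64 behaves like setEpi64
// with the argument list reversed.
func setEpi64(e7, e6, e5, e4, e3, e2, e1, e0 int64) [8]int64 {
	return [8]int64{e0, e1, e2, e3, e4, e5, e6, e7}
}

func setrEpi64(e7, e6, e5, e4, e3, e2, e1, e0 int64) [8]int64 {
	return [8]int64{e7, e6, e5, e4, e3, e2, e1, e0}
}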

func M512SetrEpi64

func M512SetrEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)

M512SetrEpi64: Set packed 64-bit integers in 'dst' with the supplied values in reverse order.

dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_epi64'. Requires AVX512F.

func M512SetrPd

func M512SetrPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)

M512SetrPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_pd'. Requires AVX512F.

func M512SetrPs

func M512SetrPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)

M512SetrPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_ps'. Requires AVX512F.

func M512Setzero

func M512Setzero() (dst x86.M512)

M512Setzero: Return vector of type __m512 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero'. Requires AVX512F.

func M512SetzeroEpi32

func M512SetzeroEpi32() (dst x86.M512i)

M512SetzeroEpi32: Return vector of type __m512i with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_epi32'. Requires AVX512F.

func M512SetzeroPd

func M512SetzeroPd() (dst x86.M512d)

M512SetzeroPd: Return vector of type __m512d with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_pd'. Requires AVX512F.

func M512SetzeroPs

func M512SetzeroPs() (dst x86.M512)

M512SetzeroPs: Return vector of type __m512 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_ps'. Requires AVX512F.

func M512SetzeroSi512

func M512SetzeroSi512() (dst x86.M512i)

M512SetzeroSi512: Return vector of type __m512i with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_si512'. Requires AVX512F.

func M512ShuffleF32x4

func M512ShuffleF32x4(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512ShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
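
Each two-bit field of 'imm8' selects one 128-bit lane: the low two fields index into 'a', the high two into 'b'. A scalar sketch with hypothetical helper names, modelling the vector as four lanes of four float32:

package sketch

// select4 mirrors SELECT4: pick one of four 128-bit lanes.
func select4(src [4][4]float32, control byte) [4]float32 {
	return src[control&3]
}

// shuffleF32x4 sketches _mm512_shuffle_f32x4: destination lanes
// 0-1 come from a, lanes 2-3 from b, each chosen by two imm8 bits.
func shuffleF32x4(a, b [4][4]float32, imm8 byte) [4][4]float32 {
	return [4][4]float32{
		select4(a, imm8),    // dst[127:0]
		select4(a, imm8>>2), // dst[255:128]
		select4(b, imm8>>4), // dst[383:256]
		select4(b, imm8>>6), // dst[511:384]
	}
}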

func M512ShuffleF64x2

func M512ShuffleF64x2(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512ShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShuffleI32x4

func M512ShuffleI32x4(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShuffleI64x2

func M512ShuffleI64x2(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShufflePd

func M512ShufflePd(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512ShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShufflePs

func M512ShufflePs(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512ShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(b[255:128], imm8[5:4])
dst[255:224] := SELECT4(b[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(b[383:256], imm8[5:4])
dst[383:352] := SELECT4(b[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(b[511:384], imm8[5:4])
dst[511:480] := SELECT4(b[511:384], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SinPd

func M512SinPd(a x86.M512d) (dst x86.M512d)

M512SinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sin_pd'. Requires AVX512F.

func M512SinPs

func M512SinPs(a x86.M512) (dst x86.M512)

M512SinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sin_ps'. Requires AVX512F.

func M512SincosPd

func M512SincosPd(cos_res *x86.M512d, a x86.M512d) (dst x86.M512d)

M512SincosPd: Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in 'a' and stores the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	cos_res[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sincos_pd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SincosPs

func M512SincosPs(cos_res *x86.M512, a x86.M512) (dst x86.M512)

M512SincosPs: Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in 'a' and stores the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	cos_res[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sincos_ps'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SindPd

func M512SindPd(a x86.M512d) (dst x86.M512d)

M512SindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sind_pd'. Requires AVX512F.

func M512SindPs

func M512SindPs(a x86.M512) (dst x86.M512)

M512SindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sind_ps'. Requires AVX512F.

func M512SinhPd

func M512SinhPd(a x86.M512d) (dst x86.M512d)

M512SinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sinh_pd'. Requires AVX512F.

func M512SinhPs

func M512SinhPs(a x86.M512) (dst x86.M512)

M512SinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sinh_ps'. Requires AVX512F.

func M512SllEpi32

func M512SllEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_sll_epi32'. Requires AVX512F.

func M512SllEpi64

func M512SllEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_sll_epi64'. Requires AVX512F.
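
Note that all lanes shift by the same count (the low 64 bits of 'count'), and counts above the element width produce zero. Go's variable shifts already behave this way, but a sketch of one 64-bit lane keeps the guard explicit:

package sketch

// sllEpi64Lane sketches one lane of _mm512_sll_epi64: a left shift
// that yields zero once the count exceeds 63. (Go's own a << count
// also returns 0 for counts >= 64; the guard mirrors the pseudocode.)
func sllEpi64Lane(a, count uint64) uint64 {
	if count > 63 {
		return 0
	}
	return a << count
}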

func M512SlliEpi64

func M512SlliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SllvEpi64

func M512SllvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_sllv_epi64'. Requires AVX512F.

func M512SqrtPd

func M512SqrtPd(a x86.M512d) (dst x86.M512d)

M512SqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_pd'. Requires AVX512F.

func M512SqrtPs

func M512SqrtPs(a x86.M512) (dst x86.M512)

M512SqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_ps'. Requires AVX512F.

func M512SqrtRoundPd

func M512SqrtRoundPd(a x86.M512d, rounding int) (dst x86.M512d)

M512SqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := SQRT(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_round_pd'. Requires AVX512F.

func M512SqrtRoundPs

func M512SqrtRoundPs(a x86.M512, rounding int) (dst x86.M512)

M512SqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := SQRT(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_round_ps'. Requires AVX512F.

func M512SraEpi32

func M512SraEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_sra_epi32'. Requires AVX512F.

func M512SraEpi64

func M512SraEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_sra_epi64'. Requires AVX512F.

func M512SraiEpi64

func M512SraiEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
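
One lane of this operation behaves like Go's arithmetic right shift on int64, with oversized counts collapsing every bit to the sign bit. A sketch:

package sketch

// sraiEpi64Lane sketches one lane of _mm512_srai_epi64: an arithmetic
// right shift that saturates to all sign bits for counts above 63.
func sraiEpi64Lane(a int64, imm8 uint8) int64 {
	if imm8 > 63 {
		return a >> 63 // all sign bits: 0 or -1
	}
	return a >> imm8
}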

func M512SravEpi64

func M512SravEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_srav_epi64'. Requires AVX512F.

func M512SrlEpi32

func M512SrlEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_srl_epi32'. Requires AVX512F.

func M512SrlEpi64

func M512SrlEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srl_epi64'. Requires AVX512F.

func M512SrliEpi64

func M512SrliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SrlvEpi64

func M512SrlvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_srlv_epi64'. Requires AVX512F.

func M512SubEpi64

func M512SubEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_sub_epi64'. Requires AVX512F.

func M512SvmlRoundPd

func M512SvmlRoundPd(a x86.M512d) (dst x86.M512d)

M512SvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_svml_round_pd'. Requires AVX512F.

func M512TanPd

func M512TanPd(a x86.M512d) (dst x86.M512d)

M512TanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tan_pd'. Requires AVX512F.

func M512TanPs

func M512TanPs(a x86.M512) (dst x86.M512)

M512TanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tan_ps'. Requires AVX512F.

func M512TandPd

func M512TandPd(a x86.M512d) (dst x86.M512d)

M512TandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tand_pd'. Requires AVX512F.

func M512TandPs

func M512TandPs(a x86.M512) (dst x86.M512)

M512TandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tand_ps'. Requires AVX512F.

func M512TanhPd

func M512TanhPd(a x86.M512d) (dst x86.M512d)

M512TanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tanh_pd'. Requires AVX512F.

func M512TanhPs

func M512TanhPs(a x86.M512) (dst x86.M512)

M512TanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tanh_ps'. Requires AVX512F.

func M512TernarylogicEpi32

func M512TernarylogicEpi32(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 15
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
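
In other words, 'imm8' is an eight-entry truth table indexed by the corresponding bits of 'a', 'b', and 'c'. A sketch of one 32-bit lane (hypothetical helper, not part of this package); for instance, imm8 = 0xE8 yields the bitwise majority of the three inputs:

package sketch

// ternlogLane sketches one 32-bit lane of _mm512_ternarylogic_epi32:
// for each bit position, look up imm8's bit at index (a<<2)|(b<<1)|c.
func ternlogLane(a, b, c uint32, imm8 uint8) uint32 {
	var dst uint32
	for h := 0; h < 32; h++ {
		idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1
		dst |= uint32(imm8>>idx&1) << h
	}
	return dst
}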

func M512TernarylogicEpi64

func M512TernarylogicEpi64(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 7
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512TestEpi64Mask

func M512TestEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_test_epi64_mask'. Requires AVX512F.

func M512TestnEpi32Mask

func M512TestnEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 15
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_testn_epi32_mask'. Requires AVX512F.

func M512TestnEpi64Mask

func M512TestnEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_testn_epi64_mask'. Requires AVX512F.
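
test and testn are exact complements over the live mask bits: test sets k[j] where the lane-wise AND is non-zero, testn where it is zero. A sketch for the 64-bit variants:

package sketch

// testEpi64Mask sketches _mm512_test_epi64_mask: one mask bit per
// lane, set when a AND b is non-zero in that lane.
func testEpi64Mask(a, b [8]uint64) uint8 {
	var k uint8
	for j := range a {
		if a[j]&b[j] != 0 {
			k |= 1 << j
		}
	}
	return k
}

// testnEpi64Mask sketches _mm512_testn_epi64_mask: the complement.
func testnEpi64Mask(a, b [8]uint64) uint8 {
	return ^testEpi64Mask(a, b)
}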

func M512TruncPd

func M512TruncPd(a x86.M512d) (dst x86.M512d)

M512TruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_trunc_pd'. Requires AVX512F.

func M512TruncPs

func M512TruncPs(a x86.M512) (dst x86.M512)

M512TruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_trunc_ps'. Requires AVX512F.

func M512Undefined

func M512Undefined() (dst x86.M512)

M512Undefined: Return vector of type __m512 with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined'. Requires AVX512F.

func M512UndefinedEpi32

func M512UndefinedEpi32() (dst x86.M512i)

M512UndefinedEpi32: Return vector of type __m512i with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_epi32'. Requires AVX512F.

func M512UndefinedPd

func M512UndefinedPd() (dst x86.M512d)

M512UndefinedPd: Return vector of type __m512d with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_pd'. Requires AVX512F.

func M512UndefinedPs

func M512UndefinedPs() (dst x86.M512)

M512UndefinedPs: Return vector of type __m512 with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_ps'. Requires AVX512F.

func M512UnpackhiEpi32

func M512UnpackhiEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_unpackhi_epi32'. Requires AVX512F.
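
A sketch of the per-lane interleave, treating one 128-bit lane as four 32-bit elements:

package sketch

// interleaveHighDwords mirrors INTERLEAVE_HIGH_DWORDS: interleave the
// upper two 32-bit elements of each source lane (elements 2 and 3).
func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
	return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
}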

func M512UnpackhiEpi64

func M512UnpackhiEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_unpackhi_epi64'. Requires AVX512F.

func M512UnpackhiPd

func M512UnpackhiPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512UnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_unpackhi_pd'. Requires AVX512F.

func M512UnpackhiPs

func M512UnpackhiPs(a x86.M512, b x86.M512) (dst x86.M512)

M512UnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_unpackhi_ps'. Requires AVX512F.

func M512UnpackloEpi32

func M512UnpackloEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_unpacklo_epi32'. Requires AVX512F.

func M512UnpackloEpi64

func M512UnpackloEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_unpacklo_epi64'. Requires AVX512F.

func M512UnpackloPd

func M512UnpackloPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512UnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_unpacklo_pd'. Requires AVX512F.

func M512UnpackloPs

func M512UnpackloPs(a x86.M512, b x86.M512) (dst x86.M512)

M512UnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_unpacklo_ps'. Requires AVX512F.

func Mask2Permutex2varEpi32

func Mask2Permutex2varEpi32(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)

Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm_mask2_permutex2var_epi32'. Requires AVX512F.
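
Each 'idx' element carries both the lane index and the table selector: for this 128-bit, 32-bit-element form, bits 1:0 pick the lane and bit 2 picks between 'a' and 'b'. A sketch with hypothetical helper names:

package sketch

// mask2Permutex2varEpi32 sketches _mm_mask2_permutex2var_epi32 on
// 4-lane arrays: idx bit 2 selects the table (a or b), bits 1:0 the
// lane; lanes whose mask bit is clear keep the idx value itself.
func mask2Permutex2varEpi32(a, idx [4]uint32, k uint8, b [4]uint32) [4]uint32 {
	var dst [4]uint32
	for j := range dst {
		if k>>j&1 == 0 {
			dst[j] = idx[j] // writemask off: copy from idx
			continue
		}
		off := idx[j] & 3 // lane within the chosen table
		if idx[j]&4 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}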

func Mask2Permutex2varEpi64

func Mask2Permutex2varEpi64(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)

Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm_mask2_permutex2var_epi64'. Requires AVX512F.

func Mask2Permutex2varPd

func Mask2Permutex2varPd(a x86.M128d, idx x86.M128i, k x86.Mmask8, b x86.M128d) (dst x86.M128d)

Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm_mask2_permutex2var_pd'. Requires AVX512F.

func Mask2Permutex2varPs

func Mask2Permutex2varPs(a x86.M128, idx x86.M128i, k x86.Mmask8, b x86.M128) (dst x86.M128)

Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm_mask2_permutex2var_ps'. Requires AVX512F.

func Mask3FmaddPd

func Mask3FmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask3_fmadd_pd'. Requires AVX512F.
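
Go's math.FMA computes a*b+c with a single rounding, which is the closest scalar analogue of these fused operations. A sketch of the two-lane masked form:

package sketch

import "math"

// mask3FmaddPd sketches _mm_mask3_fmadd_pd on two-lane arrays:
// fused multiply-add where masked-off lanes keep c's value.
func mask3FmaddPd(a, b, c [2]float64, k uint8) [2]float64 {
	var dst [2]float64
	for j := range dst {
		if k>>j&1 == 1 {
			dst[j] = math.FMA(a[j], b[j], c[j]) // (a*b)+c, single rounding
		} else {
			dst[j] = c[j]
		}
	}
	return dst
}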

func Mask3FmaddPs

func Mask3FmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask3_fmadd_ps'. Requires AVX512F.

func Mask3FmaddRoundSd

func Mask3FmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_round_sd'. Requires AVX512F.

func Mask3FmaddRoundSs

func Mask3FmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_round_ss'. Requires AVX512F.

func Mask3FmaddSd

func Mask3FmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_sd'. Requires AVX512F.

func Mask3FmaddSs

func Mask3FmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_ss'. Requires AVX512F.

func Mask3FmaddsubPd

func Mask3FmaddsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask3_fmaddsub_pd'. Requires AVX512F.
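
The even/odd alternation is easy to misread: even lanes compute (a*b)-c and odd lanes (a*b)+c. A sketch, again using math.FMA as the scalar stand-in:

package sketch

import "math"

// mask3FmaddsubPd sketches _mm_mask3_fmaddsub_pd: even lanes subtract
// c, odd lanes add c; lanes whose mask bit is clear keep c.
func mask3FmaddsubPd(a, b, c [2]float64, k uint8) [2]float64 {
	var dst [2]float64
	for j := range dst {
		switch {
		case k>>j&1 == 0:
			dst[j] = c[j]
		case j%2 == 0:
			dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: (a*b)-c
		default:
			dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: (a*b)+c
		}
	}
	return dst
}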

func Mask3FmaddsubPs

func Mask3FmaddsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask3_fmaddsub_ps'. Requires AVX512F.

func Mask3FmsubPd

func Mask3FmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask3_fmsub_pd'. Requires AVX512F.

func Mask3FmsubPs

func Mask3FmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask3_fmsub_ps'. Requires AVX512F.

func Mask3FmsubRoundSd

func Mask3FmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_round_sd'. Requires AVX512F.

func Mask3FmsubRoundSs

func Mask3FmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_round_ss'. Requires AVX512F.

func Mask3FmsubSd

func Mask3FmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_sd'. Requires AVX512F.

func Mask3FmsubSs

func Mask3FmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_ss'. Requires AVX512F.

func Mask3FmsubaddPd

func Mask3FmsubaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask3_fmsubadd_pd'. Requires AVX512F.

func Mask3FmsubaddPs

func Mask3FmsubaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask3_fmsubadd_ps'. Requires AVX512F.
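
As an illustrative plain-Go model of the subtract/add alternation (the helper name is invented; no real intrinsic is involved):

	// fmsubaddPsModel mirrors _mm_mask3_fmsubadd_ps: even lanes add
	// 'c', odd lanes subtract it; masked-off lanes pass 'c' through.
	func fmsubaddPsModel(a, b, c [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = c[j] // mask bit clear: pass through 'c'
				continue
			}
			if j%2 == 0 {
				dst[j] = a[j]*b[j] + c[j] // even lane: add
			} else {
				dst[j] = a[j]*b[j] - c[j] // odd lane: subtract
			}
		}
		return
	}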

func Mask3FnmaddPd

func Mask3FnmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask3_fnmadd_pd'. Requires AVX512F.

func Mask3FnmaddPs

func Mask3FnmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask3_fnmadd_ps'. Requires AVX512F.

func Mask3FnmaddRoundSd

func Mask3FnmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_round_sd'. Requires AVX512F.

func Mask3FnmaddRoundSs

func Mask3FnmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_round_ss'. Requires AVX512F.

func Mask3FnmaddSd

func Mask3FnmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_sd'. Requires AVX512F.

func Mask3FnmaddSs

func Mask3FnmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_ss'. Requires AVX512F.
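
A minimal plain-Go sketch of the negated-multiply-add semantics (the helper name is invented; Go performs an unfused multiply followed by an add):

	// fnmaddSsModel mirrors _mm_mask3_fnmadd_ss for lane 0: the
	// product is negated before the add; lanes 1-3 come from 'a'.
	func fnmaddSsModel(a, b, c [4]float32, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = -(a[0] * b[0]) + c[0]
		} else {
			dst[0] = c[0]
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3] // upper lanes from 'a'
		return
	}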

func Mask3FnmsubPd

func Mask3FnmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask3_fnmsub_pd'. Requires AVX512F.

func Mask3FnmsubPs

func Mask3FnmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask3_fnmsub_ps'. Requires AVX512F.

func Mask3FnmsubRoundSd

func Mask3FnmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_round_sd'. Requires AVX512F.

func Mask3FnmsubRoundSs

func Mask3FnmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_round_ss'. Requires AVX512F.

func Mask3FnmsubSd

func Mask3FnmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_sd'. Requires AVX512F.

func Mask3FnmsubSs

func Mask3FnmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_ss'. Requires AVX512F.

func MaskAbsEpi32

func MaskAbsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm_mask_abs_epi32'. Requires AVX512F.

func MaskAbsEpi64

func MaskAbsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_mask_abs_epi64'. Requires AVX512F.

func MaskAddEpi32

func MaskAddEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm_mask_add_epi32'. Requires AVX512F.

func MaskAddEpi64

func MaskAddEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm_mask_add_epi64'. Requires AVX512F.
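
The per-lane writemask loop translates directly to plain Go. A sketch, illustrative only (the helper name is invented):

	// maskAddEpi32Model mirrors _mm_mask_add_epi32: lanes whose mask
	// bit is set receive a+b, the rest are copied from 'src'.
	func maskAddEpi32Model(src, a, b [4]int32, k uint8) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] + b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}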

func MaskAddRoundSd

func MaskAddRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] + b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_round_sd'. Requires AVX512F.

func MaskAddRoundSs

func MaskAddRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] + b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_round_ss'. Requires AVX512F.

func MaskAddSd

func MaskAddSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_sd'. Requires AVX512F.

func MaskAddSs

func MaskAddSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_ss'. Requires AVX512F.
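
A plain-Go model of the masked scalar add (hypothetical helper): only lane 0 is masked, while lanes 1-3 always come from 'a':

	// maskAddSsModel mirrors _mm_mask_add_ss.
	func maskAddSsModel(src, a, b [4]float32, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = a[0] + b[0]
		} else {
			dst[0] = src[0] // note: from 'src', not from 'c' as in the mask3 forms
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3]
		return
	}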

func MaskAndEpi32

func MaskAndEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm_mask_and_epi32'. Requires AVX512F.

func MaskAndEpi64

func MaskAndEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm_mask_and_epi64'. Requires AVX512F.

func MaskAndnotEpi32

func MaskAndnotEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm_mask_andnot_epi32'. Requires AVX512F.

func MaskAndnotEpi64

func MaskAndnotEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm_mask_andnot_epi64'. Requires AVX512F.
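
Note the operand order in the pseudocode: the result is (NOT 'a') AND 'b', not 'a' AND (NOT 'b'). A plain-Go sketch (illustrative only):

	// maskAndnotEpi32Model mirrors _mm_mask_andnot_epi32.
	func maskAndnotEpi32Model(src, a, b [4]uint32, k uint8) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = ^a[j] & b[j] // (NOT a) AND b
			} else {
				dst[j] = src[j]
			}
		}
		return
	}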

func MaskBlendEpi32

func MaskBlendEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMD'. Intrinsic: '_mm_mask_blend_epi32'. Requires AVX512F.

func MaskBlendEpi64

func MaskBlendEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMQ'. Intrinsic: '_mm_mask_blend_epi64'. Requires AVX512F.

func MaskBlendPd

func MaskBlendPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBLENDMPD'. Intrinsic: '_mm_mask_blend_pd'. Requires AVX512F.

func MaskBlendPs

func MaskBlendPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBLENDMPS'. Intrinsic: '_mm_mask_blend_ps'. Requires AVX512F.
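
Unlike the writemasked operations above, blend takes no 'src' operand; the mask simply selects between the two inputs per lane. A plain-Go sketch (the helper name is invented):

	// maskBlendPsModel mirrors _mm_mask_blend_ps.
	func maskBlendPsModel(a, b [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = b[j] // mask bit set: take 'b'
			} else {
				dst[j] = a[j] // mask bit clear: take 'a'
			}
		}
		return
	}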

func MaskBroadcastdEpi32

func MaskBroadcastdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_broadcastd_epi32'. Requires AVX512F.

func MaskBroadcastqEpi64

func MaskBroadcastqEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_broadcastq_epi64'. Requires AVX512F.

func MaskBroadcastssPs

func MaskBroadcastssPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_mask_broadcastss_ps'. Requires AVX512F.
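
A plain-Go model of the masked broadcast (illustrative only): lane 0 of 'a' is replicated into every selected lane:

	// maskBroadcastssPsModel mirrors _mm_mask_broadcastss_ps.
	func maskBroadcastssPsModel(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[0] // broadcast the low element
			} else {
				dst[j] = src[j]
			}
		}
		return
	}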

func MaskCmpEpi32Mask

func MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
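
The _MM_CMPINT table maps naturally onto a switch. The sketch below is an illustrative plain-Go model of the masked compare (the helper name is invented), not a working intrinsic:

	// cmpEpi32MaskModel mirrors _mm_mask_cmp_epi32_mask for the eight
	// _MM_CMPINT predicates (imm8 values 0-7), returning a 4-bit mask.
	func cmpEpi32MaskModel(k1 uint8, a, b [4]int32, imm8 byte) (k uint8) {
		for j := 0; j < 4; j++ {
			if k1&(1<<uint(j)) == 0 {
				continue // zeromask: bit stays 0
			}
			var r bool
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				r = a[j] == b[j]
			case 1: // _MM_CMPINT_LT
				r = a[j] < b[j]
			case 2: // _MM_CMPINT_LE
				r = a[j] <= b[j]
			case 3: // _MM_CMPINT_FALSE
				r = false
			case 4: // _MM_CMPINT_NEQ
				r = a[j] != b[j]
			case 5: // _MM_CMPINT_NLT
				r = a[j] >= b[j]
			case 6: // _MM_CMPINT_NLE
				r = a[j] > b[j]
			case 7: // _MM_CMPINT_TRUE
				r = true
			}
			if r {
				k |= 1 << uint(j)
			}
		}
		return
	}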

func MaskCmpEpi64Mask

func MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu32Mask

func MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu64Mask

func MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpPdMask

func MaskCmpPdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm_mask_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpPsMask

func MaskCmpPsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm_mask_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpRoundSdMask

func MaskCmpRoundSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)

MaskCmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	IF k1[0]
		k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
	ELSE
		k[0] := 0
	FI
	k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_round_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpRoundSsMask

func MaskCmpRoundSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)

MaskCmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	IF k1[0]
		k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
	ELSE
		k[0] := 0
	FI
	k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_round_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpSdMask

func MaskCmpSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskCmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

IF k1[0]
	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
ELSE
	k[0] := 0
FI
k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpSsMask

func MaskCmpSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

MaskCmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

IF k1[0]
	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
ELSE
	k[0] := 0
FI
k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpeqEpi32Mask

func MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpeq_epi32_mask'. Requires AVX512F.

func MaskCmpeqEpi64Mask

func MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpeq_epi64_mask'. Requires AVX512F.

func MaskCmpeqEpu32Mask

func MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpeq_epu32_mask'. Requires AVX512F.

func MaskCmpeqEpu64Mask

func MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpeq_epu64_mask'. Requires AVX512F.
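
A plain-Go model of the zeromasked equality compare (hypothetical helper); the fixed-predicate forms are shorthands for MaskCmpEpi32Mask and friends with the corresponding _MM_CMPINT operand:

	// cmpeqEpi32MaskModel mirrors _mm_mask_cmpeq_epi32_mask.
	func cmpeqEpi32MaskModel(k1 uint8, a, b [4]int32) (k uint8) {
		for j := 0; j < 4; j++ {
			if k1&(1<<uint(j)) != 0 && a[j] == b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}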

func MaskCmpgeEpi32Mask

func MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpge_epi32_mask'. Requires AVX512F.

func MaskCmpgeEpi64Mask

func MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpge_epi64_mask'. Requires AVX512F.

func MaskCmpgeEpu32Mask

func MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpge_epu32_mask'. Requires AVX512F.

func MaskCmpgeEpu64Mask

func MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpge_epu64_mask'. Requires AVX512F.

func MaskCmpgtEpi32Mask

func MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpgt_epi32_mask'. Requires AVX512F.

func MaskCmpgtEpi64Mask

func MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpgt_epi64_mask'. Requires AVX512F.

func MaskCmpgtEpu32Mask

func MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpgt_epu32_mask'. Requires AVX512F.

func MaskCmpgtEpu64Mask

func MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpgt_epu64_mask'. Requires AVX512F.

func MaskCmpleEpi32Mask

func MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmple_epi32_mask'. Requires AVX512F.

func MaskCmpleEpi64Mask

func MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmple_epi64_mask'. Requires AVX512F.

func MaskCmpleEpu32Mask

func MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmple_epu32_mask'. Requires AVX512F.

func MaskCmpleEpu64Mask

func MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmple_epu64_mask'. Requires AVX512F.

func MaskCmpltEpi32Mask

func MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmplt_epi32_mask'. Requires AVX512F.

func MaskCmpltEpi64Mask

func MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmplt_epi64_mask'. Requires AVX512F.

func MaskCmpltEpu32Mask

func MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmplt_epu32_mask'. Requires AVX512F.

func MaskCmpltEpu64Mask

func MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmplt_epu64_mask'. Requires AVX512F.

func MaskCmpneqEpi32Mask

func MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpneq_epi32_mask'. Requires AVX512F.

func MaskCmpneqEpi64Mask

func MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpneq_epi64_mask'. Requires AVX512F.

func MaskCmpneqEpu32Mask

func MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpneq_epu32_mask'. Requires AVX512F.

func MaskCmpneqEpu64Mask

func MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpneq_epu64_mask'. Requires AVX512F.

func MaskCompressEpi32

func MaskCompressEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_mask_compress_epi32'. Requires AVX512F.

func MaskCompressEpi64

func MaskCompressEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_mask_compress_epi64'. Requires AVX512F.

func MaskCompressPd

func MaskCompressPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_mask_compress_pd'. Requires AVX512F.

func MaskCompressPs

func MaskCompressPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_mask_compress_ps'. Requires AVX512F.
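
Compression packs the selected lanes toward lane 0 in order and fills the remaining lanes from 'src'. A plain-Go sketch (illustrative only):

	// maskCompressPsModel mirrors _mm_mask_compress_ps.
	func maskCompressPsModel(src, a [4]float32, k uint8) (dst [4]float32) {
		m := 0
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[m] = a[j] // active lanes are stored contiguously
				m++
			}
		}
		for ; m < 4; m++ {
			dst[m] = src[m] // remaining lanes pass through from 'src'
		}
		return
	}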

func MaskCvtRoundpsPh

func MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvt_roundps_ph'. Requires AVX512F.

func MaskCvtRoundsdSs

func MaskCvtRoundsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

MaskCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvt_roundsd_ss'. Requires AVX512F.
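
A plain-Go model of the masked narrowing conversion (the helper name is invented). Go's float64-to-float32 conversion rounds to nearest even, which corresponds to the default rounding direction rather than an explicit 'rounding' argument:

	// maskCvtsdSsModel mirrors _mm_mask_cvt_roundsd_ss under
	// round-to-nearest: b[0] is narrowed when mask bit 0 is set.
	func maskCvtsdSsModel(src, a [4]float32, b [2]float64, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = float32(b[0]) // rounds to nearest even
		} else {
			dst[0] = src[0]
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3] // upper lanes from 'a'
		return
	}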

func MaskCvtRoundssSd

func MaskCvtRoundssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

MaskCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvt_roundss_sd'. Requires AVX512F.

func MaskCvtepi16Epi32

func MaskCvtepi16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_mask_cvtepi16_epi32'. Requires AVX512F.

func MaskCvtepi16Epi64

func MaskCvtepi16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_mask_cvtepi16_epi64'. Requires AVX512F.

func MaskCvtepi32Epi16

func MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_mask_cvtepi32_epi16'. Requires AVX512F.

func MaskCvtepi32Epi64

func MaskCvtepi32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_mask_cvtepi32_epi64'. Requires AVX512F.
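
A plain-Go sketch of the masked sign extension (illustrative only); Go's int64(int32) conversion sign extends, matching VPMOVSXDQ:

	// maskCvtepi32Epi64Model mirrors _mm_mask_cvtepi32_epi64: the low
	// two 32-bit lanes of 'a' widen into the two 64-bit lanes.
	func maskCvtepi32Epi64Model(src [2]int64, a [4]int32, k uint8) (dst [2]int64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(a[j]) // sign-extending conversion
			} else {
				dst[j] = src[j]
			}
		}
		return
	}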

func MaskCvtepi32Epi8

func MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_mask_cvtepi32_epi8'. Requires AVX512F.

func MaskCvtepi32Pd

func MaskCvtepi32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_mask_cvtepi32_pd'. Requires AVX512F.

func MaskCvtepi32Ps

func MaskCvtepi32Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_mask_cvtepi32_ps'. Requires AVX512F.
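
The same mask-select pattern carries over to the int→float conversions. A hypothetical sketch:

	// maskCvtepi32PsRef sketches VCVTDQ2PS under a writemask.
	func maskCvtepi32PsRef(src [4]float32, k uint8, a [4]int32) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = float32(a[j]) // rounds to nearest even for magnitudes above 2^24
			} else {
				dst[j] = src[j]
			}
		}
		return
	}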

func MaskCvtepi64Epi16

func MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_mask_cvtepi64_epi16'. Requires AVX512F.

func MaskCvtepi64Epi32

func MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_mask_cvtepi64_epi32'. Requires AVX512F.

func MaskCvtepi64Epi8

func MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_mask_cvtepi64_epi8'. Requires AVX512F.

func MaskCvtepi8Epi32

func MaskCvtepi8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_mask_cvtepi8_epi32'. Requires AVX512F.

func MaskCvtepi8Epi64

func MaskCvtepi8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_mask_cvtepi8_epi64'. Requires AVX512F.

func MaskCvtepu16Epi32

func MaskCvtepu16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_mask_cvtepu16_epi32'. Requires AVX512F.

func MaskCvtepu16Epi64

func MaskCvtepu16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_mask_cvtepu16_epi64'. Requires AVX512F.

func MaskCvtepu32Epi64

func MaskCvtepu32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_mask_cvtepu32_epi64'. Requires AVX512F.

func MaskCvtepu32Pd

func MaskCvtepu32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_mask_cvtepu32_pd'. Requires AVX512F.

func MaskCvtepu8Epi32

func MaskCvtepu8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_mask_cvtepu8_epi32'. Requires AVX512F.

func MaskCvtepu8Epi64

func MaskCvtepu8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_mask_cvtepu8_epi64'. Requires AVX512F.

func MaskCvtpdEpi32

func MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_mask_cvtpd_epi32'. Requires AVX512F.
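
A hypothetical sketch of the masked conversion (the package and function names are illustrative; the real instruction honours MXCSR.RC, which plain Go cannot change, so this assumes the default round-to-nearest-even mode):

	package avx512ref

	import "math"

	// maskCvtpdEpi32Ref sketches VCVTPD2DQ under a writemask, assuming
	// round-to-nearest-even and ignoring out-of-range inputs.
	func maskCvtpdEpi32Ref(src [2]int32, k uint8, a [2]float64) (dst [2]int32) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(math.RoundToEven(a[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}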

func MaskCvtpdEpu32

func MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_mask_cvtpd_epu32'. Requires AVX512F.

func MaskCvtpdPs

func MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M128d) (dst x86.M128)

MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_mask_cvtpd_ps'. Requires AVX512F.

func MaskCvtphPs

func MaskCvtphPs(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_mask_cvtph_ps'. Requires AVX512F.

func MaskCvtpsEpi32

func MaskCvtpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_mask_cvtps_epi32'. Requires AVX512F.

func MaskCvtpsEpu32

func MaskCvtpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_mask_cvtps_epu32'. Requires AVX512F.

func MaskCvtpsPh

func MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvtps_ph'. Requires AVX512F.

func MaskCvtsdSs

func MaskCvtsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)

MaskCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvtsd_ss'. Requires AVX512F.

func MaskCvtsepi32Epi16

func MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_mask_cvtsepi32_epi16'. Requires AVX512F.

func MaskCvtsepi32Epi8

func MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_mask_cvtsepi32_epi8'. Requires AVX512F.
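
Signed saturation clamps to the target type's range instead of discarding high bits. A hypothetical sketch:

	// maskCvtsepi32Epi8Ref sketches VPMOVSDB: each selected 32-bit lane is
	// clamped to [-128, 127] before narrowing to int8.
	func maskCvtsepi32Epi8Ref(src [4]int8, k uint8, a [4]int32) (dst [4]int8) {
		sat := func(v int32) int8 {
			if v > 127 {
				return 127
			}
			if v < -128 {
				return -128
			}
			return int8(v)
		}
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = sat(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}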

func MaskCvtsepi64Epi16

func MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_mask_cvtsepi64_epi16'. Requires AVX512F.

func MaskCvtsepi64Epi32

func MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_mask_cvtsepi64_epi32'. Requires AVX512F.

func MaskCvtsepi64Epi8

func MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_mask_cvtsepi64_epi8'. Requires AVX512F.

func MaskCvtssSd

func MaskCvtssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)

MaskCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvtss_sd'. Requires AVX512F.

func MaskCvttpdEpi32

func MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_mask_cvttpd_epi32'. Requires AVX512F.

func MaskCvttpdEpu32

func MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_mask_cvttpd_epu32'. Requires AVX512F.

func MaskCvttpsEpi32

func MaskCvttpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_mask_cvttps_epi32'. Requires AVX512F.

func MaskCvttpsEpu32

func MaskCvttpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_mask_cvttps_epu32'. Requires AVX512F.

func MaskCvtusepi32Epi16

func MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_mask_cvtusepi32_epi16'. Requires AVX512F.
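
Unlike the truncating VPMOVDW earlier, the unsigned saturating form clamps before narrowing. A hypothetical sketch:

	// maskCvtusepi32Epi16Ref sketches VPMOVUSDW: each selected unsigned
	// 32-bit lane is clamped to 0xFFFF before narrowing.
	func maskCvtusepi32Epi16Ref(src [4]uint16, k uint8, a [4]uint32) (dst [4]uint16) {
		for j := 0; j < 4; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = src[j]
			case a[j] > 0xFFFF:
				dst[j] = 0xFFFF // unsigned saturation
			default:
				dst[j] = uint16(a[j])
			}
		}
		return
	}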

func MaskCvtusepi32Epi8

func MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_mask_cvtusepi32_epi8'. Requires AVX512F.

func MaskCvtusepi64Epi16

func MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_mask_cvtusepi64_epi16'. Requires AVX512F.

func MaskCvtusepi64Epi32

func MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_mask_cvtusepi64_epi32'. Requires AVX512F.

func MaskCvtusepi64Epi8

func MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_mask_cvtusepi64_epi8'. Requires AVX512F.

func MaskDivPd

func MaskDivPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm_mask_div_pd'. Requires AVX512F.

func MaskDivPs

func MaskDivPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm_mask_div_ps'. Requires AVX512F.
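
A hypothetical sketch of the masked division; as with IEEE 754 hardware, Go's float division by zero yields ±Inf rather than trapping:

	// maskDivPsRef sketches VDIVPS under a writemask: only selected lanes
	// are divided; the rest pass through from 'src'.
	func maskDivPsRef(src [4]float32, k uint8, a, b [4]float32) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] / b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}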

func MaskDivRoundSd

func MaskDivRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] / b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_round_sd'. Requires AVX512F.

func MaskDivRoundSs

func MaskDivRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] / b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_round_ss'. Requires AVX512F.

func MaskDivSd

func MaskDivSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] / b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_sd'. Requires AVX512F.

func MaskDivSs

func MaskDivSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] / b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_ss'. Requires AVX512F.

func MaskExpandEpi32

func MaskExpandEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm_mask_expand_epi32'. Requires AVX512F.
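
Note the subtlety that distinguishes expand from a plain blend: the source index advances only on set mask bits, so 'a' is consumed contiguously. A hypothetical sketch:

	// maskExpandEpi32Ref sketches VPEXPANDD: set lanes receive consecutive
	// elements of 'a' while clear lanes keep their 'src' values.
	func maskExpandEpi32Ref(src [4]int32, k uint8, a [4]int32) (dst [4]int32) {
		m := 0
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m] // next unconsumed element of 'a'
				m++
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

With k = 0b1010, for example, dst becomes [src[0], a[0], src[2], a[1]].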

func MaskExpandEpi64

func MaskExpandEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_mask_expand_epi64'. Requires AVX512F.

func MaskExpandPd

func MaskExpandPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm_mask_expand_pd'. Requires AVX512F.

func MaskExpandPs

func MaskExpandPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm_mask_expand_ps'. Requires AVX512F.

func MaskFixupimmPd

func MaskFixupimmPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
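
The core of the pseudocode is a 4-bit table lookup: the classified token type selects one nibble of the control word in 'c', which then picks the fix-up action. A hypothetical sketch of just that selection step:

	// tokenResponse extracts the 4-bit action code for token class j (0..7),
	// mirroring token_response[3:0] := src3[3+4*j:4*j] in the pseudocode.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}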

func MaskFixupimmPs

func MaskFixupimmPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmRoundSd

func MaskFixupimmRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

MaskFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		IF k[0]
			dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmRoundSs

func MaskFixupimmRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

MaskFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		IF k[0]
			dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmSd

func MaskFixupimmSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

IF k[0]
	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmSs

func MaskFixupimmSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

IF k[0]
	dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFmaddPd

func MaskFmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask_fmadd_pd'. Requires AVX512F.
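
Since Go 1.14 the standard library's math.FMA computes x*y+z with a single rounding, matching the fused multiply-add these instructions perform. A hypothetical masked sketch:

	package avx512ref

	import "math"

	// maskFmaddPdRef sketches VFMADD*PD under a writemask: selected lanes get
	// the fused a*b+c; masked-out lanes keep their value from 'a'.
	func maskFmaddPdRef(a [2]float64, k uint8, b, c [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.FMA(a[j], b[j], c[j])
			} else {
				dst[j] = a[j]
			}
		}
		return
	}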

func MaskFmaddPs

func MaskFmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask_fmadd_ps'. Requires AVX512F.

func MaskFmaddRoundSd

func MaskFmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_round_sd'. Requires AVX512F.

func MaskFmaddRoundSs

func MaskFmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_round_ss'. Requires AVX512F.

func MaskFmaddSd

func MaskFmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_sd'. Requires AVX512F.

func MaskFmaddSs

func MaskFmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_ss'. Requires AVX512F.

func MaskFmaddsubPd

func MaskFmaddsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask_fmaddsub_pd'. Requires AVX512F.
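
The even/odd lane split is the only difference from plain FMA; negating 'c' on even lanes turns the fused add into the required subtract. A hypothetical sketch:

	package avx512ref

	import "math"

	// maskFmaddsubPdRef sketches VFMADDSUB*PD: even lanes compute a*b-c,
	// odd lanes a*b+c, each under the writemask (clear bits keep 'a').
	func maskFmaddsubPdRef(a [2]float64, k uint8, b, c [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = a[j]
			case j%2 == 0:
				dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: a*b - c
			default:
				dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: a*b + c
			}
		}
		return
	}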

func MaskFmaddsubPs

func MaskFmaddsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask_fmaddsub_ps'. Requires AVX512F.

func MaskFmsubPd

func MaskFmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask_fmsub_pd'. Requires AVX512F.

func MaskFmsubPs

func MaskFmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask_fmsub_ps'. Requires AVX512F.

func MaskFmsubRoundSd

func MaskFmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_round_sd'. Requires AVX512F.

func MaskFmsubRoundSs

func MaskFmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_round_ss'. Requires AVX512F.

func MaskFmsubSd

func MaskFmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_sd'. Requires AVX512F.

func MaskFmsubSs

func MaskFmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_ss'. Requires AVX512F.

func MaskFmsubaddPd

func MaskFmsubaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask_fmsubadd_pd'. Requires AVX512F.

func MaskFmsubaddPs

func MaskFmsubaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask_fmsubadd_ps'. Requires AVX512F.

func MaskFnmaddPd

func MaskFnmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask_fnmadd_pd'. Requires AVX512F.

func MaskFnmaddPs

func MaskFnmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask_fnmadd_ps'. Requires AVX512F.
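
For reference, a pure-Go model of the fnmadd writemask semantics; the helper name and [4]float32 representation are illustrative stand-ins, not part of this package.

	package main

	import "fmt"

	// maskFnmaddPs models _mm_mask_fnmadd_ps: each selected lane computes
	// -(a*b) + c; unselected lanes keep the value from 'a'.
	func maskFnmaddPs(a, b, c [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = -(a[j] * b[j]) + c[j]
			} else {
				dst[j] = a[j]
			}
		}
		return
	}

	func main() {
		a := [4]float32{1, 2, 3, 4}
		b := [4]float32{5, 5, 5, 5}
		c := [4]float32{100, 100, 100, 100}
		fmt.Println(maskFnmaddPs(a, b, c, 0b0101)) // [95 2 85 4]
	}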

func MaskFnmaddRoundSd

func MaskFnmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_round_sd'. Requires AVX512F.

func MaskFnmaddRoundSs

func MaskFnmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_round_ss'. Requires AVX512F.

func MaskFnmaddSd

func MaskFnmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_sd'. Requires AVX512F.

func MaskFnmaddSs

func MaskFnmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_ss'. Requires AVX512F.

func MaskFnmsubPd

func MaskFnmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask_fnmsub_pd'. Requires AVX512F.

func MaskFnmsubPs

func MaskFnmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask_fnmsub_ps'. Requires AVX512F.

func MaskFnmsubRoundSd

func MaskFnmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_round_sd'. Requires AVX512F.

func MaskFnmsubRoundSs

func MaskFnmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_round_ss'. Requires AVX512F.

func MaskFnmsubSd

func MaskFnmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_sd'. Requires AVX512F.

func MaskFnmsubSs

func MaskFnmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_ss'. Requires AVX512F.
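
A scalar sketch of the fnmsub single-precision semantics, including the pass-through of the upper three lanes; plain Go stand-ins, not package API.

	package main

	import "fmt"

	// maskFnmsubSs models _mm_mask_fnmsub_ss: lane 0 computes -(a*b) - c when
	// mask bit 0 is set (otherwise it keeps a[0]); lanes 1..3 always come
	// from 'a'.
	func maskFnmsubSs(a, b, c [4]float32, k uint8) (dst [4]float32) {
		dst = a // upper 3 lanes are copied from 'a'
		if k&1 != 0 {
			dst[0] = -(a[0] * b[0]) - c[0]
		}
		return
	}

	func main() {
		a := [4]float32{2, 9, 9, 9}
		b := [4]float32{3, 0, 0, 0}
		c := [4]float32{1, 0, 0, 0}
		fmt.Println(maskFnmsubSs(a, b, c, 1)) // [-7 9 9 9]
		fmt.Println(maskFnmsubSs(a, b, c, 0)) // [2 9 9 9]
	}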

func MaskGetexpPd

func MaskGetexpPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_mask_getexp_pd'. Requires AVX512F.

func MaskGetexpPs

func MaskGetexpPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_mask_getexp_ps'. Requires AVX512F.
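
Since getexp computes floor(log2(|x|)) per element, the operation can be modeled for normal, nonzero inputs with math.Frexp; special values (zero, infinity, NaN) follow different rules on real hardware and are not handled in this sketch.

	package main

	import (
		"fmt"
		"math"
	)

	// maskGetexpPs models _mm_mask_getexp_ps for normal, nonzero inputs:
	// each selected lane becomes floor(log2(|x|)) as a float32.
	func maskGetexpPs(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				// Frexp returns f, e with |f| in [0.5, 1) and x = f * 2^e,
				// so floor(log2(|x|)) is e-1.
				_, e := math.Frexp(float64(a[j]))
				dst[j] = float32(e - 1)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{9, 9, 9, 9}
		a := [4]float32{8, 1.5, 0.25, 100}
		fmt.Println(maskGetexpPs(src, a, 0b1011)) // [3 0 9 6]
	}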

func MaskGetexpRoundSd

func MaskGetexpRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := ConvertExpFP64(b[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_round_sd'. Requires AVX512F.

func MaskGetexpRoundSs

func MaskGetexpRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := ConvertExpFP32(b[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_round_ss'. Requires AVX512F.

func MaskGetexpSd

func MaskGetexpSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_sd'. Requires AVX512F.

func MaskGetexpSs

func MaskGetexpSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_ss'. Requires AVX512F.

func MaskGetmantPd

func MaskGetmantPd(src x86.M128d, k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_mask_getmant_pd'. Requires AVX512F.

func MaskGetmantPs

func MaskGetmantPs(src x86.M128, k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_mask_getmant_ps'. Requires AVX512F.
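
For the common combination interv = _MM_MANT_NORM_1_2 and sc = _MM_MANT_SIGN_src, the normalization can be modeled with math.Frexp, which already splits a float into a mantissa in [0.5, 1) and an exponent; doubling that mantissa moves it into [1, 2). A hedged sketch on plain Go values, ignoring special inputs:

	package main

	import (
		"fmt"
		"math"
	)

	// maskGetmantPs models _mm_mask_getmant_ps for interv = _MM_MANT_NORM_1_2
	// and sc = _MM_MANT_SIGN_src on normal inputs: each selected lane is
	// rescaled so its magnitude lies in [1, 2), keeping the input's sign.
	func maskGetmantPs(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				f, _ := math.Frexp(float64(a[j])) // |f| in [0.5, 1)
				dst[j] = float32(2 * f)           // magnitude now in [1, 2)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{0, 0, 0, 0}
		a := [4]float32{12, -12, 0.375, 1}
		fmt.Println(maskGetmantPs(src, a, 0b0111)) // [1.5 -1.5 1.5 0]
	}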

func MaskGetmantRoundSd

func MaskGetmantRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

MaskGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_round_sd'. Requires AVX512F.

func MaskGetmantRoundSs

func MaskGetmantRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

MaskGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_round_ss'. Requires AVX512F.

func MaskGetmantSd

func MaskGetmantSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_sd'. Requires AVX512F.

func MaskGetmantSs

func MaskGetmantSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_ss'. Requires AVX512F.

func MaskLoadSd

func MaskLoadSd(src x86.M128d, k x86.Mmask8, mem_addr *float64) (dst x86.M128d)

MaskLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
	dst[63:0] := src[63:0]
FI
dst[MAX:64] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_load_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskLoadSs

func MaskLoadSs(src x86.M128, k x86.Mmask8, mem_addr *float32) (dst x86.M128)

MaskLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
	dst[31:0] := src[31:0]
FI
dst[MAX:32] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_load_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
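
A pure-Go model of the masked scalar load, including the zeroing of the upper lanes; the alignment requirement of the real instruction is not modeled, and the helper is illustrative only.

	package main

	import "fmt"

	// maskLoadSs models _mm_mask_load_ss: lane 0 is loaded from memory when
	// mask bit 0 is set and taken from 'src' otherwise; lanes 1..3 are always
	// zeroed, matching dst[MAX:32] := 0.
	func maskLoadSs(src [4]float32, k uint8, memAddr *float32) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = *memAddr
		} else {
			dst[0] = src[0]
		}
		return
	}

	func main() {
		v := float32(3.5)
		src := [4]float32{7, 7, 7, 7}
		fmt.Println(maskLoadSs(src, 1, &v)) // [3.5 0 0 0]
		fmt.Println(maskLoadSs(src, 0, &v)) // [7 0 0 0]
	}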

func MaskMaxEpi32

func MaskMaxEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm_mask_max_epi32'. Requires AVX512F.
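
The writemask pattern shared by the min/max intrinsics in this group, modeled on plain [4]int32 values; an illustrative helper, not package API.

	package main

	import "fmt"

	// maskMaxEpi32 models _mm_mask_max_epi32: selected lanes take the signed
	// maximum of 'a' and 'b', unselected lanes come from 'src'.
	func maskMaxEpi32(src [4]int32, k uint8, a, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			switch {
			case k&(1<<j) == 0:
				dst[j] = src[j]
			case a[j] > b[j]:
				dst[j] = a[j]
			default:
				dst[j] = b[j]
			}
		}
		return
	}

	func main() {
		src := [4]int32{0, 0, 0, 0}
		a := [4]int32{-5, 10, 3, -1}
		b := [4]int32{4, 2, 3, -8}
		fmt.Println(maskMaxEpi32(src, 0b1110, a, b)) // [0 10 3 -1]
	}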

func MaskMaxEpi64

func MaskMaxEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_mask_max_epi64'. Requires AVX512F.

func MaskMaxEpu32

func MaskMaxEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm_mask_max_epu32'. Requires AVX512F.

func MaskMaxEpu64

func MaskMaxEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_mask_max_epu64'. Requires AVX512F.

func MaskMaxPd

func MaskMaxPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm_mask_max_pd'. Requires AVX512F.

func MaskMaxPs

func MaskMaxPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm_mask_max_ps'. Requires AVX512F.

func MaskMaxRoundSd

func MaskMaxRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MAX(a[63:0], b[63:0])
	ELSE
		dst[63:0] := src[63:0]
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_round_sd'. Requires AVX512F.

func MaskMaxRoundSs

func MaskMaxRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MAX(a[31:0], b[31:0])
	ELSE
		dst[31:0] := src[31:0]
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_round_ss'. Requires AVX512F.

func MaskMaxSd

func MaskMaxSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_sd'. Requires AVX512F.

func MaskMaxSs

func MaskMaxSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_ss'. Requires AVX512F.

func MaskMinEpi32

func MaskMinEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm_mask_min_epi32'. Requires AVX512F.

func MaskMinEpi64

func MaskMinEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_mask_min_epi64'. Requires AVX512F.

func MaskMinEpu32

func MaskMinEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm_mask_min_epu32'. Requires AVX512F.

func MaskMinEpu64

func MaskMinEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_mask_min_epu64'. Requires AVX512F.

func MaskMinPd

func MaskMinPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm_mask_min_pd'. Requires AVX512F.

func MaskMinPs

func MaskMinPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm_mask_min_ps'. Requires AVX512F.

func MaskMinRoundSd

func MaskMinRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MIN(a[63:0], b[63:0])
	ELSE
		dst[63:0] := src[63:0]
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_round_sd'. Requires AVX512F.

func MaskMinRoundSs

func MaskMinRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MIN(a[31:0], b[31:0])
	ELSE
		dst[31:0] := src[31:0]
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_round_ss'. Requires AVX512F.

func MaskMinSd

func MaskMinSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_sd'. Requires AVX512F.

func MaskMinSs

func MaskMinSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_ss'. Requires AVX512F.

func MaskMovEpi32

func MaskMovEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm_mask_mov_epi32'. Requires AVX512F.
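
Masked moves are per-lane blends between 'a' and 'src'. A minimal sketch on plain Go values:

	package main

	import "fmt"

	// maskMovEpi32 models _mm_mask_mov_epi32: lanes with the mask bit set
	// come from 'a', the rest from 'src'.
	func maskMovEpi32(src [4]int32, k uint8, a [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]int32{0, 0, 0, 0}
		a := [4]int32{1, 2, 3, 4}
		fmt.Println(maskMovEpi32(src, 0b0110, a)) // [0 2 3 0]
	}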

func MaskMovEpi64

func MaskMovEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm_mask_mov_epi64'. Requires AVX512F.

func MaskMovPd

func MaskMovPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm_mask_mov_pd'. Requires AVX512F.

func MaskMovPs

func MaskMovPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm_mask_mov_ps'. Requires AVX512F.

func MaskMoveSd

func MaskMoveSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_move_sd'. Requires AVX512F.

func MaskMoveSs

func MaskMoveSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_move_ss'. Requires AVX512F.

func MaskMovedupPd

func MaskMovedupPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm_mask_movedup_pd'. Requires AVX512F.

func MaskMovehdupPs

func MaskMovehdupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_mask_movehdup_ps'. Requires AVX512F.

func MaskMoveldupPs

func MaskMoveldupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_mask_moveldup_ps'. Requires AVX512F.
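
A sketch of the duplicate-then-blend behavior shared by the movedup/movehdup/moveldup entries above, here for the even-lane (moveldup) case; the helper is illustrative, not package API.

	package main

	import "fmt"

	// maskMoveldupPs models _mm_mask_moveldup_ps: even-indexed lanes of 'a'
	// are duplicated into the adjacent odd lanes, then the writemask blends
	// the result against 'src'.
	func maskMoveldupPs(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
		tmp := [4]float32{a[0], a[0], a[2], a[2]}
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{9, 9, 9, 9}
		a := [4]float32{1, 2, 3, 4}
		fmt.Println(maskMoveldupPs(src, 0b1110, a)) // [9 1 3 3]
	}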

func MaskMulEpi32

func MaskMulEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm_mask_mul_epi32'. Requires AVX512F.

func MaskMulEpu32

func MaskMulEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm_mask_mul_epu32'. Requires AVX512F.

func MaskMulPd

func MaskMulPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm_mask_mul_pd'. Requires AVX512F.

func MaskMulPs

func MaskMulPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm_mask_mul_ps'. Requires AVX512F.

func MaskMulRoundSd

func MaskMulRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] * b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_round_sd'. Requires AVX512F.

func MaskMulRoundSs

func MaskMulRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] * b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_round_ss'. Requires AVX512F.

func MaskMulSd

func MaskMulSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] * b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_sd'. Requires AVX512F.

func MaskMulSs

func MaskMulSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] * b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_ss'. Requires AVX512F.

func MaskMulloEpi32

func MaskMulloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm_mask_mullo_epi32'. Requires AVX512F.
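
The contrast with MaskMulEpi32 above is in what is kept of the 64-bit product: mul_epi32 stores the full widened product per 64-bit lane, while mullo_epi32 keeps only the low 32 bits. A single-lane sketch (illustrative helpers, not package API):

	package main

	import "fmt"

	// mulEpi32Lane models one lane pair of _mm_mask_mul_epi32: the low
	// 32-bit integers are sign-extended and multiplied to a full 64-bit
	// product. mulloEpi32Lane models _mm_mask_mullo_epi32, which keeps only
	// the low 32 bits of the same product.
	func mulEpi32Lane(a, b int32) int64   { return int64(a) * int64(b) }
	func mulloEpi32Lane(a, b int32) int32 { return int32(int64(a) * int64(b)) }

	func main() {
		a, b := int32(100000), int32(100000)
		fmt.Println(mulEpi32Lane(a, b))   // 10000000000 (full 64-bit product)
		fmt.Println(mulloEpi32Lane(a, b)) // 1410065408 (low 32 bits only)
	}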

func MaskOrEpi32

func MaskOrEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORD'. Intrinsic: '_mm_mask_or_epi32'. Requires AVX512F.

func MaskOrEpi64

func MaskOrEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm_mask_or_epi64'. Requires AVX512F.

func MaskPermutePd

func MaskPermutePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskPermutePs

func MaskPermutePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
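
A pure-Go model of the SELECT4 pseudocode above: each destination lane picks one of a's four lanes via a 2-bit field of 'imm8', then the writemask blends against 'src'. The helper is illustrative, not package API.

	package main

	import "fmt"

	// maskPermutePs models _mm_mask_permute_ps: lane j of the temporary
	// result is a[imm8[2j+1:2j]], and the writemask then selects between
	// that and src[j].
	func maskPermutePs(src [4]float32, k uint8, a [4]float32, imm8 byte) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			sel := (imm8 >> (2 * j)) & 3 // 2-bit selector for lane j
			if k&(1<<j) != 0 {
				dst[j] = a[sel]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{0, 0, 0, 0}
		a := [4]float32{10, 20, 30, 40}
		// imm8 = 0b00_01_10_11 selects lanes 3,2,1,0: a full reversal.
		fmt.Println(maskPermutePs(src, 0b1111, a, 0b00011011)) // [40 30 20 10]
	}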

func MaskPermutevarPd

func MaskPermutevarPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)

MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permutevar_pd'. Requires AVX512F.

func MaskPermutevarPs

func MaskPermutevarPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)

MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permutevar_ps'. Requires AVX512F.

func MaskPermutex2varEpi32

func MaskPermutex2varEpi32(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm_mask_permutex2var_epi32'. Requires AVX512F.

func MaskPermutex2varEpi64

func MaskPermutex2varEpi64(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm_mask_permutex2var_epi64'. Requires AVX512F.

func MaskPermutex2varPd

func MaskPermutex2varPd(a x86.M128d, k x86.Mmask8, idx x86.M128i, b x86.M128d) (dst x86.M128d)

MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm_mask_permutex2var_pd'. Requires AVX512F.

func MaskPermutex2varPs

func MaskPermutex2varPs(a x86.M128, k x86.Mmask8, idx x86.M128i, b x86.M128) (dst x86.M128)

MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm_mask_permutex2var_ps'. Requires AVX512F.

func MaskRcp14Pd

func MaskRcp14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_mask_rcp14_pd'. Requires AVX512F.

func MaskRcp14Ps

func MaskRcp14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_mask_rcp14_ps'. Requires AVX512F.
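
A minimal pure-Go model of the masked reciprocal, using an exact division where the hardware produces an approximation with relative error below 2^-14 (maskRcp14Ps is hypothetical):

func maskRcp14Ps(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = 1.0 / a[j] // stand-in for APPROXIMATE(1.0/a): exact instead of approximate
		} else {
			dst[j] = src[j]
		}
	}
	return
}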

func MaskRcp14Sd

func MaskRcp14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_mask_rcp14_sd'. Requires AVX512F.

func MaskRcp14Ss

func MaskRcp14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_mask_rcp14_ss'. Requires AVX512F.

func MaskRolEpi32

func MaskRolEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRolEpi64

func MaskRolEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRolvEpi32

func MaskRolvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_mask_rolv_epi32'. Requires AVX512F.
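
The variable left-rotate maps directly onto math/bits; a sketch under the same masking convention (maskRolvEpi32 is hypothetical):

import "math/bits"

func maskRolvEpi32(src [4]uint32, k uint8, a, b [4]uint32) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = bits.RotateLeft32(a[j], int(b[j]&31)) // rotate count is taken modulo 32
		} else {
			dst[j] = src[j]
		}
	}
	return
}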

func MaskRolvEpi64

func MaskRolvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_mask_rolv_epi64'. Requires AVX512F.

func MaskRorEpi32

func MaskRorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRorEpi64

func MaskRorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRorvEpi32

func MaskRorvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_mask_rorv_epi32'. Requires AVX512F.

func MaskRorvEpi64

func MaskRorvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_mask_rorv_epi64'. Requires AVX512F.

func MaskRoundscalePd

func MaskRoundscalePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscalePs

func MaskRoundscalePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
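
The round-and-scale operation can be modeled in pure Go; a sketch that honors imm8[1:0] and imm8[7:4] but not the MXCSR path selected by imm8[2] (maskRoundscalePs is hypothetical):

import "math"

func maskRoundscalePs(src [4]float32, k uint8, a [4]float32, imm8 byte) (dst [4]float32) {
	scale := math.Pow(2, float64(imm8>>4)) // 2^M, M = imm8[7:4] fraction bits to keep
	round := [4]func(float64) float64{
		math.RoundToEven, // 0: round to nearest even
		math.Floor,       // 1: round toward -Inf
		math.Ceil,        // 2: round toward +Inf
		math.Trunc,       // 3: round toward zero
	}[imm8&3]
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = float32(round(float64(a[j])*scale) / scale) // round at 2^M, scale back down
		} else {
			dst[j] = src[j]
		}
	}
	return
}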

func MaskRoundscaleRoundSd

func MaskRoundscaleRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleRoundSs

func MaskRoundscaleRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleSd

func MaskRoundscaleSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleSs

func MaskRoundscaleSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRsqrt14Pd

func MaskRsqrt14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_mask_rsqrt14_pd'. Requires AVX512F.

func MaskRsqrt14Ps

func MaskRsqrt14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_mask_rsqrt14_ps'. Requires AVX512F.

func MaskRsqrt14Sd

func MaskRsqrt14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_mask_rsqrt14_sd'. Requires AVX512F.

func MaskRsqrt14Ss

func MaskRsqrt14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_mask_rsqrt14_ss'. Requires AVX512F.

func MaskScalefPd

func MaskScalefPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_mask_scalef_pd'. Requires AVX512F.

func MaskScalefPs

func MaskScalefPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_mask_scalef_ps'. Requires AVX512F.
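
Ignoring the NaN/Inf/denormal special cases in the pseudocode, the scale step is a * 2^FLOOR(b), which math.Ldexp expresses directly (maskScalefPs is a hypothetical sketch):

import "math"

func maskScalefPs(src [4]float32, k uint8, a, b [4]float32) (dst [4]float32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			// Ldexp(x, e) computes x * 2^e; the special-case handling from SCALE() is omitted.
			dst[j] = float32(math.Ldexp(float64(a[j]), int(math.Floor(float64(b[j])))))
		} else {
			dst[j] = src[j]
		}
	}
	return
}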

func MaskScalefRoundSd

func MaskScalefRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskScalefRoundSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := SCALE(a[63:0], b[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_round_sd'. Requires AVX512F.

func MaskScalefRoundSs

func MaskScalefRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskScalefRoundSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := SCALE(a[31:0], b[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_round_ss'. Requires AVX512F.

func MaskScalefSd

func MaskScalefSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskScalefSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_sd'. Requires AVX512F.

func MaskScalefSs

func MaskScalefSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskScalefSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_ss'. Requires AVX512F.

func MaskSet1Epi32

func MaskSet1Epi32(src x86.M128i, k x86.Mmask8, a int) (dst x86.M128i)

MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_set1_epi32'. Requires AVX512F.

func MaskSet1Epi64

func MaskSet1Epi64(src x86.M128i, k x86.Mmask8, a int64) (dst x86.M128i)

MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_set1_epi64'. Requires AVX512F.

func MaskShuffleEpi32

func MaskShuffleEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm_mask_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
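
Each destination element takes two control bits from imm8; a pure-Go sketch of the masked shuffle (maskShuffleEpi32 is hypothetical):

func maskShuffleEpi32(src [4]uint32, k uint8, a [4]uint32, imm8 byte) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[(imm8>>(2*uint(j)))&3] // SELECT4 using imm8 bits 2j+1:2j
		} else {
			dst[j] = src[j]
		}
	}
	return
}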

func MaskShufflePd

func MaskShufflePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskShufflePs

func MaskShufflePs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSllEpi32

func MaskSllEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_sll_epi32'. Requires AVX512F.

func MaskSllEpi64

func MaskSllEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_sll_epi64'. Requires AVX512F.

func MaskSlliEpi32

func MaskSlliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSlliEpi64

func MaskSlliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSllvEpi32

func MaskSllvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm_mask_sllv_epi32'. Requires AVX512F.
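
A per-element shift with masking; as in hardware, counts above 31 produce zero (Go's << on uint32 already yields 0 for such counts, but the sketch makes it explicit; maskSllvEpi32 is hypothetical):

func maskSllvEpi32(src [4]uint32, k uint8, a, count [4]uint32) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		switch {
		case k&(1<<uint(j)) == 0:
			dst[j] = src[j] // mask bit clear: copy from 'src'
		case count[j] > 31:
			dst[j] = 0 // out-of-range shift counts zero the element
		default:
			dst[j] = a[j] << count[j]
		}
	}
	return
}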

func MaskSllvEpi64

func MaskSllvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm_mask_sllv_epi64'. Requires AVX512F.

func MaskSqrtPd

func MaskSqrtPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm_mask_sqrt_pd'. Requires AVX512F.

func MaskSqrtPs

func MaskSqrtPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm_mask_sqrt_ps'. Requires AVX512F.

func MaskSqrtRoundSd

func MaskSqrtRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := SQRT(a[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_round_sd'. Requires AVX512F.

func MaskSqrtRoundSs

func MaskSqrtRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := SQRT(a[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_round_ss'. Requires AVX512F.

func MaskSqrtSd

func MaskSqrtSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := SQRT(a[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_sd'. Requires AVX512F.

func MaskSqrtSs

func MaskSqrtSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := SQRT(a[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_ss'. Requires AVX512F.

func MaskSraEpi32

func MaskSraEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_sra_epi32'. Requires AVX512F.

func MaskSraEpi64

func MaskSraEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_sra_epi64'. Requires AVX512F.

func MaskSraiEpi32

func MaskSraiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSraiEpi64

func MaskSraiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSravEpi32

func MaskSravEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm_mask_srav_epi32'. Requires AVX512F.
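
Go's >> on a signed type is already an arithmetic shift, so the sign-extending behavior falls out naturally; counts above 31 saturate to a full sign fill (maskSravEpi32 is hypothetical):

func maskSravEpi32(src [4]int32, k uint8, a [4]int32, count [4]uint32) (dst [4]int32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			c := count[j]
			if c > 31 {
				c = 31 // shifting a 32-bit value by 31 replicates the sign bit everywhere
			}
			dst[j] = a[j] >> c // arithmetic shift: shifts in sign bits
		} else {
			dst[j] = src[j]
		}
	}
	return
}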

func MaskSravEpi64

func MaskSravEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_mask_srav_epi64'. Requires AVX512F.

func MaskSrlEpi32

func MaskSrlEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srl_epi32'. Requires AVX512F.

func MaskSrlEpi64

func MaskSrlEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srl_epi64'. Requires AVX512F.

func MaskSrliEpi32

func MaskSrliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSrliEpi64

func MaskSrliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSrlvEpi32

func MaskSrlvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm_mask_srlv_epi32'. Requires AVX512F.

func MaskSrlvEpi64

func MaskSrlvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm_mask_srlv_epi64'. Requires AVX512F.

func MaskStoreSd

func MaskStoreSd(mem_addr *float64, k x86.Mmask8, a x86.M128d)

MaskStoreSd: Store the lower double-precision (64-bit) floating-point element from 'a' into memory using writemask 'k'.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	MEM[mem_addr+63:mem_addr] := a[63:0]
FI

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_store_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskStoreSs

func MaskStoreSs(mem_addr *float32, k x86.Mmask8, a x86.M128)

MaskStoreSs: Store the lower single-precision (32-bit) floating-point element from 'a' into memory using writemask 'k'.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	MEM[mem_addr+31:mem_addr] := a[31:0]
FI

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_store_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
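
In Go terms the masked scalar store is just a conditional write through the pointer; the 16-byte alignment requirement has no direct Go-level analogue (maskStoreSs is hypothetical):

func maskStoreSs(memAddr *float32, k uint8, a [4]float32) {
	if k&1 != 0 {
		*memAddr = a[0] // store only when mask bit 0 is set
	}
}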

func MaskSubEpi32

func MaskSubEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm_mask_sub_epi32'. Requires AVX512F.

func MaskSubEpi64

func MaskSubEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm_mask_sub_epi64'. Requires AVX512F.

func MaskSubPd

func MaskSubPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm_mask_sub_pd'. Requires AVX512F.

func MaskSubPs

func MaskSubPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm_mask_sub_ps'. Requires AVX512F.

func MaskSubRoundSd

func MaskSubRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] - b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_round_sd'. Requires AVX512F.

func MaskSubRoundSs

func MaskSubRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] - b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_round_ss'. Requires AVX512F.

func MaskSubSd

func MaskSubSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] - b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_sd'. Requires AVX512F.

func MaskSubSs

func MaskSubSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] - b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_ss'. Requires AVX512F.

func MaskTernarylogicEpi32

func MaskTernarylogicEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'src', 'a', and 'b' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
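
The per-bit lookup is easier to see in scalar form. A minimal pure-Go sketch of one 32-bit lane (the function name and the example imm8 are illustrative); with imm8 = 0xE8 the table encodes the three-way majority function:

	// ternaryLogic32 applies VPTERNLOGD's per-bit rule to one 32-bit
	// lane: the bits of src, a and b form a 3-bit index into imm8.
	func ternaryLogic32(src, a, b uint32, imm8 uint8) uint32 {
		var dst uint32
		for h := uint(0); h < 32; h++ {
			idx := (src>>h&1)<<2 | (a>>h&1)<<1 | b>>h&1
			dst |= uint32(imm8>>idx&1) << h
		}
		return dst
	}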

func MaskTernarylogicEpi64

func MaskTernarylogicEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific function is selected by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskTestEpi32Mask

func MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm_mask_test_epi32_mask'. Requires AVX512F.
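
Mask-producing tests map naturally onto a uint8 in Go. An illustrative sketch of the pseudocode above (not a real binding):

	// maskTestEpi32Mask sets result bit j only when writemask bit j
	// of k1 is set and the lane-wise AND a[j]&b[j] is non-zero.
	func maskTestEpi32Mask(k1 uint8, a, b [4]uint32) (k uint8) {
		for j := uint(0); j < 4; j++ {
			if k1&(1<<j) != 0 && a[j]&b[j] != 0 {
				k |= 1 << j
			}
		}
		return
	}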

func MaskTestEpi64Mask

func MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm_mask_test_epi64_mask'. Requires AVX512F.

func MaskTestnEpi32Mask

func MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm_mask_testn_epi32_mask'. Requires AVX512F.

func MaskTestnEpi64Mask

func MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_mask_testn_epi64_mask'. Requires AVX512F.

func MaskUnpackhiEpi32

func MaskUnpackhiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_mask_unpackhi_epi32'. Requires AVX512F.
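
The INTERLEAVE_HIGH_DWORDS helper is just a fixed shuffle; the masked variant then selects per lane between this result and 'src'. A pure-Go sketch (illustrative only):

	// interleaveHighDwords mirrors INTERLEAVE_HIGH_DWORDS above: the
	// two high 32-bit lanes of each operand are interleaved a,b,a,b.
	func interleaveHighDwords(a, b [4]uint32) [4]uint32 {
		return [4]uint32{a[2], b[2], a[3], b[3]}
	}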

func MaskUnpackhiEpi64

func MaskUnpackhiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_mask_unpackhi_epi64'. Requires AVX512F.

func MaskUnpackhiPd

func MaskUnpackhiPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_mask_unpackhi_pd'. Requires AVX512F.

func MaskUnpackhiPs

func MaskUnpackhiPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_mask_unpackhi_ps'. Requires AVX512F.

func MaskUnpackloEpi32

func MaskUnpackloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_mask_unpacklo_epi32'. Requires AVX512F.

func MaskUnpackloEpi64

func MaskUnpackloEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_mask_unpacklo_epi64'. Requires AVX512F.

func MaskUnpackloPd

func MaskUnpackloPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_mask_unpacklo_pd'. Requires AVX512F.

func MaskUnpackloPs

func MaskUnpackloPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_mask_unpacklo_ps'. Requires AVX512F.

func MaskXorEpi32

func MaskXorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm_mask_xor_epi32'. Requires AVX512F.
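
All of the two-operand masked integer ops in this family share the same lane-select skeleton; only the inner operation changes. A hypothetical pure-Go model:

	// maskXorEpi32 models _mm_mask_xor_epi32: lanes whose mask bit is
	// clear are copied from src rather than computed.
	func maskXorEpi32(src [4]uint32, k uint8, a, b [4]uint32) (dst [4]uint32) {
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] ^ b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}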

func MaskXorEpi64

func MaskXorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm_mask_xor_epi64'. Requires AVX512F.

func MaskzAbsEpi32

func MaskzAbsEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm_maskz_abs_epi32'. Requires AVX512F.

func MaskzAbsEpi64

func MaskzAbsEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_maskz_abs_epi64'. Requires AVX512F.

func MaskzAddEpi32

func MaskzAddEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm_maskz_add_epi32'. Requires AVX512F.

func MaskzAddEpi64

func MaskzAddEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm_maskz_add_epi64'. Requires AVX512F.

func MaskzAddRoundSd

func MaskzAddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] + b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_round_sd'. Requires AVX512F.

func MaskzAddRoundSs

func MaskzAddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] + b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_round_ss'. Requires AVX512F.

func MaskzAddSd

func MaskzAddSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_sd'. Requires AVX512F.

func MaskzAddSs

func MaskzAddSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_ss'. Requires AVX512F.

func MaskzAndEpi32

func MaskzAndEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm_maskz_and_epi32'. Requires AVX512F.

func MaskzAndEpi64

func MaskzAndEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm_maskz_and_epi64'. Requires AVX512F.

func MaskzAndnotEpi32

func MaskzAndnotEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm_maskz_andnot_epi32'. Requires AVX512F.

func MaskzAndnotEpi64

func MaskzAndnotEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm_maskz_andnot_epi64'. Requires AVX512F.

func MaskzBroadcastdEpi32

func MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_broadcastd_epi32'. Requires AVX512F.

func MaskzBroadcastqEpi64

func MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_broadcastq_epi64'. Requires AVX512F.

func MaskzBroadcastssPs

func MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_maskz_broadcastss_ps'. Requires AVX512F.

func MaskzCompressEpi32

func MaskzCompressEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_maskz_compress_epi32'. Requires AVX512F.
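
Compression packs the selected lanes toward element 0 rather than selecting in place. A minimal Go sketch of the pseudocode above (names are illustrative):

	// maskzCompressEpi32 models VPCOMPRESSD with a zeromask: active
	// lanes are stored contiguously from element 0; the tail stays
	// zero because Go zero-initializes the result array.
	func maskzCompressEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
		m := 0
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		return
	}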

func MaskzCompressEpi64

func MaskzCompressEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_maskz_compress_epi64'. Requires AVX512F.

func MaskzCompressPd

func MaskzCompressPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_maskz_compress_pd'. Requires AVX512F.

func MaskzCompressPs

func MaskzCompressPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_maskz_compress_ps'. Requires AVX512F.

func MaskzCvtRoundpsPh

func MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvt_roundps_ph'. Requires AVX512F.

func MaskzCvtRoundsdSs

func MaskzCvtRoundsdSs(k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

MaskzCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvt_roundsd_ss'. Requires AVX512F.

func MaskzCvtRoundssSd

func MaskzCvtRoundssSd(k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

MaskzCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvt_roundss_sd'. Requires AVX512F.

func MaskzCvtepi16Epi32

func MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_maskz_cvtepi16_epi32'. Requires AVX512F.
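
In Go, the int16-to-int32 conversion is exactly the SignExtend used here, so the whole operation reduces to a masked conversion loop (an illustrative sketch, not part of this package):

	// maskzCvtepi16Epi32 models VPMOVSXWD with a zeromask; lanes with
	// a clear mask bit stay at Go's zero value.
	func maskzCvtepi16Epi32(k uint8, a [4]int16) (dst [4]int32) {
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // sign extension
			}
		}
		return
	}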

func MaskzCvtepi16Epi64

func MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_maskz_cvtepi16_epi64'. Requires AVX512F.

func MaskzCvtepi32Epi16

func MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_maskz_cvtepi32_epi16'. Requires AVX512F.

func MaskzCvtepi32Epi64

func MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_maskz_cvtepi32_epi64'. Requires AVX512F.

func MaskzCvtepi32Epi8

func MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_maskz_cvtepi32_epi8'. Requires AVX512F.

func MaskzCvtepi32Pd

func MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_maskz_cvtepi32_pd'. Requires AVX512F.

func MaskzCvtepi32Ps

func MaskzCvtepi32Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_maskz_cvtepi32_ps'. Requires AVX512F.

func MaskzCvtepi64Epi16

func MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_maskz_cvtepi64_epi16'. Requires AVX512F.

func MaskzCvtepi64Epi32

func MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_maskz_cvtepi64_epi32'. Requires AVX512F.

func MaskzCvtepi64Epi8

func MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_maskz_cvtepi64_epi8'. Requires AVX512F.

func MaskzCvtepi8Epi32

func MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_maskz_cvtepi8_epi32'. Requires AVX512F.

func MaskzCvtepi8Epi64

func MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_maskz_cvtepi8_epi64'. Requires AVX512F.

func MaskzCvtepu16Epi32

func MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_maskz_cvtepu16_epi32'. Requires AVX512F.

func MaskzCvtepu16Epi64

func MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_maskz_cvtepu16_epi64'. Requires AVX512F.

func MaskzCvtepu32Epi64

func MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_maskz_cvtepu32_epi64'. Requires AVX512F.

func MaskzCvtepu32Pd

func MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_maskz_cvtepu32_pd'. Requires AVX512F.

func MaskzCvtepu8Epi32

func MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_maskz_cvtepu8_epi32'. Requires AVX512F.

func MaskzCvtepu8Epi64

func MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_maskz_cvtepu8_epi64'. Requires AVX512F.

func MaskzCvtpdEpi32

func MaskzCvtpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_maskz_cvtpd_epi32'. Requires AVX512F.

func MaskzCvtpdEpu32

func MaskzCvtpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_maskz_cvtpd_epu32'. Requires AVX512F.

func MaskzCvtpdPs

func MaskzCvtpdPs(k x86.Mmask8, a x86.M128d) (dst x86.M128)

MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_maskz_cvtpd_ps'. Requires AVX512F.

func MaskzCvtphPs

func MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_maskz_cvtph_ps'. Requires AVX512F.

func MaskzCvtpsEpi32

func MaskzCvtpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_maskz_cvtps_epi32'. Requires AVX512F.

func MaskzCvtpsEpu32

func MaskzCvtpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_maskz_cvtps_epu32'. Requires AVX512F.

func MaskzCvtpsPh

func MaskzCvtpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvtps_ph'. Requires AVX512F.

func MaskzCvtsdSs

func MaskzCvtsdSs(k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)

MaskzCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvtsd_ss'. Requires AVX512F.

func MaskzCvtsepi32Epi16

func MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_maskz_cvtsepi32_epi16'. Requires AVX512F.

func MaskzCvtsepi32Epi8

func MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_maskz_cvtsepi32_epi8'. Requires AVX512F.
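
Unlike the plain truncating converts (VPMOVDB), the saturating forms clamp instead of dropping high bits. Saturate_Int32_To_Int8 can be sketched in Go as:

	// saturateInt32ToInt8 clamps to the int8 range, so e.g. 300
	// becomes 127 and -300 becomes -128, where a truncating int8(x)
	// conversion would keep only the low 8 bits.
	func saturateInt32ToInt8(x int32) int8 {
		if x > 127 {
			return 127
		}
		if x < -128 {
			return -128
		}
		return int8(x)
	}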

func MaskzCvtsepi64Epi16

func MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_maskz_cvtsepi64_epi16'. Requires AVX512F.

func MaskzCvtsepi64Epi32

func MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_maskz_cvtsepi64_epi32'. Requires AVX512F.

func MaskzCvtsepi64Epi8

func MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_maskz_cvtsepi64_epi8'. Requires AVX512F.

func MaskzCvtssSd

func MaskzCvtssSd(k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)

MaskzCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvtss_sd'. Requires AVX512F.

func MaskzCvttpdEpi32

func MaskzCvttpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_maskz_cvttpd_epi32'. Requires AVX512F.
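
The truncating ("tt") converts map directly onto Go's float-to-int conversion, which truncates toward zero; the default MXCSR round-to-nearest-even used by the non-truncating VCVTPD2DQ form corresponds to math.RoundToEven. A sketch (the hardware's out-of-range behavior of returning the integer indefinite value is not modeled):

	import "math"

	// int32(x) truncates toward zero (Convert_FP64_To_Int32_Truncate);
	// rounding first with math.RoundToEven models round-to-nearest-even.
	func cvttFP64ToInt32(x float64) int32 { return int32(x) }
	func cvtFP64ToInt32(x float64) int32  { return int32(math.RoundToEven(x)) }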

func MaskzCvttpdEpu32

func MaskzCvttpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_maskz_cvttpd_epu32'. Requires AVX512F.

func MaskzCvttpsEpi32

func MaskzCvttpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_maskz_cvttps_epi32'. Requires AVX512F.

func MaskzCvttpsEpu32

func MaskzCvttpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_maskz_cvttps_epu32'. Requires AVX512F.

func MaskzCvtusepi32Epi16

func MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_maskz_cvtusepi32_epi16'. Requires AVX512F.

func MaskzCvtusepi32Epi8

func MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_maskz_cvtusepi32_epi8'. Requires AVX512F.
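
The unsigned-saturating converts clamp only at the top of the destination range. An illustrative Go equivalent of the saturation step above:

	// saturateUint32ToUint8 clamps like VPMOVUSDB: any value above
	// 255 sticks at 255; values in range pass through unchanged.
	func saturateUint32ToUint8(x uint32) uint8 {
		if x > 255 {
			return 255
		}
		return uint8(x)
	}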

func MaskzCvtusepi64Epi16

func MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_maskz_cvtusepi64_epi16'. Requires AVX512F.

func MaskzCvtusepi64Epi32

func MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_maskz_cvtusepi64_epi32'. Requires AVX512F.

func MaskzCvtusepi64Epi8

func MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_maskz_cvtusepi64_epi8'. Requires AVX512F.

func MaskzDivPd

func MaskzDivPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm_maskz_div_pd'. Requires AVX512F.

func MaskzDivPs

func MaskzDivPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm_maskz_div_ps'. Requires AVX512F.

func MaskzDivRoundSd

func MaskzDivRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] / b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_round_sd'. Requires AVX512F.

func MaskzDivRoundSs

func MaskzDivRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] / b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_round_ss'. Requires AVX512F.

func MaskzDivSd

func MaskzDivSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] / b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_sd'. Requires AVX512F.

func MaskzDivSs

func MaskzDivSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] / b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_ss'. Requires AVX512F.

func MaskzExpandEpi32

func MaskzExpandEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm_maskz_expand_epi32'. Requires AVX512F.
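
Expand is the inverse of compress: consecutive source elements are scattered to the lanes whose mask bit is set. A minimal Go sketch (illustrative only):

	// maskzExpandEpi32 models VPEXPANDD with a zeromask: a[0], a[1],
	// ... land in the set-bit lanes of dst in order; clear-bit lanes
	// remain zero.
	func maskzExpandEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
		m := 0
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return
	}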

func MaskzExpandEpi64

func MaskzExpandEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_maskz_expand_epi64'. Requires AVX512F.

func MaskzExpandPd

func MaskzExpandPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm_maskz_expand_pd'. Requires AVX512F.

func MaskzExpandPs

func MaskzExpandPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm_maskz_expand_ps'. Requires AVX512F.

func MaskzFixupimmPd

func MaskzFixupimmPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
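
The heart of the FIXUPIMMPD pseudocode is the 'token_response' lookup: 'c' packs eight 4-bit response codes, and the token class 'j' of the classified input selects one nibble. A short Go sketch of that bit extraction (the helper name is invented):

	// tokenResponse extracts src3[3+4*j:4*j], the 4-bit response code
	// for token class j, from the packed 64-bit control word.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}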

func MaskzFixupimmPs

func MaskzFixupimmPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmRoundSd

func MaskzFixupimmRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

MaskzFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		IF k[0]
			dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
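
The 'rounding' argument is a plain int, so a caller would OR the control bits together. The Go constant names below are invented for this sketch; the numeric values mirror Intel's immintrin.h definitions:

	const (
		froundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		froundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		froundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		froundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		froundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
		froundNoExc        = 0x08 // _MM_FROUND_NO_EXC
	)

	// Truncate and suppress exceptions:
	// dst := MaskzFixupimmRoundSd(k, a, b, c, imm8, froundToZero|froundNoExc)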

func MaskzFixupimmRoundSs

func MaskzFixupimmRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

MaskzFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		IF k[0]
			dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmSd

func MaskzFixupimmSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskzFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

IF k[0]
	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmSs

func MaskzFixupimmSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskzFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

IF k[0]
	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFmaddPd

func MaskzFmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_maskz_fmadd_pd'. Requires AVX512F.
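
math.FMA computes a*b+c with a single rounding, matching the fused behavior of VFMADD, so the zeromasked packed form can be sketched directly in Go (invented helper; assumes import "math"):

	func maskzFmaddPd(k uint8, a, b, c [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) != 0 {
				dst[j] = math.FMA(a[j], b[j], c[j]) // fused: one rounding
			} // masked-off lanes stay zero
		}
		return dst
	}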

func MaskzFmaddPs

func MaskzFmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_maskz_fmadd_ps'. Requires AVX512F.

func MaskzFmaddRoundSd

func MaskzFmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_round_sd'. Requires AVX512F.

func MaskzFmaddRoundSs

func MaskzFmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_round_ss'. Requires AVX512F.

func MaskzFmaddSd

func MaskzFmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_sd'. Requires AVX512F.

func MaskzFmaddSs

func MaskzFmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_ss'. Requires AVX512F.

func MaskzFmaddsubPd

func MaskzFmaddsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_maskz_fmaddsub_pd'. Requires AVX512F.
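
The "(j is even)" branch means even lanes subtract 'c' while odd lanes add it. A Go sketch of the same alternation (invented helper; assumes import "math"):

	func maskzFmaddsubPd(k uint8, a, b, c [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) == 0 {
				continue // masked-off lane stays zero
			}
			if j%2 == 0 {
				dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: subtract
			} else {
				dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: add
			}
		}
		return dst
	}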

func MaskzFmaddsubPs

func MaskzFmaddsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_maskz_fmaddsub_ps'. Requires AVX512F.

func MaskzFmsubPd

func MaskzFmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_maskz_fmsub_pd'. Requires AVX512F.

func MaskzFmsubPs

func MaskzFmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_maskz_fmsub_ps'. Requires AVX512F.

func MaskzFmsubRoundSd

func MaskzFmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_round_sd'. Requires AVX512F.

func MaskzFmsubRoundSs

func MaskzFmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_round_ss'. Requires AVX512F.

func MaskzFmsubSd

func MaskzFmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_sd'. Requires AVX512F.

func MaskzFmsubSs

func MaskzFmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_ss'. Requires AVX512F.

func MaskzFmsubaddPd

func MaskzFmsubaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_maskz_fmsubadd_pd'. Requires AVX512F.

func MaskzFmsubaddPs

func MaskzFmsubaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_maskz_fmsubadd_ps'. Requires AVX512F.

func MaskzFnmaddPd

func MaskzFnmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_maskz_fnmadd_pd'. Requires AVX512F.
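
Note that -(a*b)+c is simply the fused multiply-add of (-a, b, c), so the negation costs no extra rounding step. In Go terms (assumes import "math"):

	// fnmadd computes -(a*b)+c with a single rounding, like VFNMADD.
	func fnmadd(a, b, c float64) float64 {
		return math.FMA(-a, b, c)
	}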

func MaskzFnmaddPs

func MaskzFnmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_maskz_fnmadd_ps'. Requires AVX512F.

func MaskzFnmaddRoundSd

func MaskzFnmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_maskz_fnmadd_round_sd'. Requires AVX512F.

func MaskzFnmaddRoundSs

func MaskzFnmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_round_ss'. Requires AVX512F.

func MaskzFnmaddSd

func MaskzFnmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD213SD, VFNMADD231SD, VFNMADD132SD'. Intrinsic: '_mm_maskz_fnmadd_sd'. Requires AVX512F.

func MaskzFnmaddSs

func MaskzFnmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_ss'. Requires AVX512F.

func MaskzFnmsubPd

func MaskzFnmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_maskz_fnmsub_pd'. Requires AVX512F.

func MaskzFnmsubPs

func MaskzFnmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_maskz_fnmsub_ps'. Requires AVX512F.

func MaskzFnmsubRoundSd

func MaskzFnmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_round_sd'. Requires AVX512F.

func MaskzFnmsubRoundSs

func MaskzFnmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_round_ss'. Requires AVX512F.

func MaskzFnmsubSd

func MaskzFnmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_sd'. Requires AVX512F.

func MaskzFnmsubSs

func MaskzFnmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_ss'. Requires AVX512F.

func MaskzGetexpPd

func MaskzGetexpPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_maskz_getexp_pd'. Requires AVX512F.
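
For finite nonzero inputs, ConvertExpFP64 is what math.Logb already returns: the unbiased binary exponent, i.e. floor(log2(|x|)). A per-element sketch of the zeromasked form (invented helper; assumes import "math"; zero/Inf/NaN behavior follows Logb and is not verified against the instruction here):

	func maskzGetexpPd(k uint8, a [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) != 0 {
				dst[j] = math.Logb(a[j]) // floor(log2(|x|)) as a float64
			} // masked-off lanes stay zero
		}
		return dst
	}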

func MaskzGetexpPs

func MaskzGetexpPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_maskz_getexp_ps'. Requires AVX512F.

func MaskzGetexpRoundSd

func MaskzGetexpRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := ConvertExpFP64(b[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_round_sd'. Requires AVX512F.

func MaskzGetexpRoundSs

func MaskzGetexpRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := ConvertExpFP32(b[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_round_ss'. Requires AVX512F.

func MaskzGetexpSd

func MaskzGetexpSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_sd'. Requires AVX512F.

func MaskzGetexpSs

func MaskzGetexpSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_ss'. Requires AVX512F.

func MaskzGetmantPd

func MaskzGetmantPd(k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_maskz_getmant_pd'. Requires AVX512F.
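
For the _MM_MANT_NORM_1_2 interval with _MM_MANT_SIGN_src, GetNormalizedMantissa can be approximated with math.Frexp, whose fraction lies in [0.5, 1). A sketch for finite nonzero inputs (zero/Inf/NaN handling omitted; assumes import "math"):

	func getmant12(x float64) float64 {
		frac, _ := math.Frexp(x) // |frac| in [0.5, 1), sign preserved
		return frac * 2          // |result| in [1, 2); scaling is exact
	}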

func MaskzGetmantPs

func MaskzGetmantPs(k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_maskz_getmant_ps'. Requires AVX512F.

func MaskzGetmantRoundSd

func MaskzGetmantRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

MaskzGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_round_sd'. Requires AVX512F.

func MaskzGetmantRoundSs

func MaskzGetmantRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

MaskzGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_round_ss'. Requires AVX512F.

func MaskzGetmantSd

func MaskzGetmantSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskzGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_sd'. Requires AVX512F.

func MaskzGetmantSs

func MaskzGetmantSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskzGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_ss'. Requires AVX512F.

func MaskzLoadSd

func MaskzLoadSd(k x86.Mmask8, mem_addr *float64) (dst x86.M128d)

MaskzLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
	dst[63:0] := 0
FI
dst[MAX:64] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_load_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
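
Pointer caveat aside, the zeromasked scalar load is easy to model: the lower lane comes from memory only when mask bit 0 is set, and everything above it is zeroed. A sketch (hypothetical helper; the real instruction additionally has the alignment requirement noted above):

	func maskzLoadSd(k uint8, mem *float64) [2]float64 {
		var dst [2]float64 // dst[1] stays zero, as in the pseudocode
		if k&1 != 0 {
			dst[0] = *mem
		}
		return dst
	}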

func MaskzLoadSs

func MaskzLoadSs(k x86.Mmask8, mem_addr *float32) (dst x86.M128)

MaskzLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
	dst[31:0] := 0
FI
dst[MAX:32] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_load_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskzMaxEpi32

func MaskzMaxEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm_maskz_max_epi32'. Requires AVX512F.
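
A Go sketch of the zeromasked signed maximum; the unsigned variants below differ only in comparing uint32/uint64 lanes (invented helper):

	func maskzMaxEpi32(k uint8, a, b [4]int32) [4]int32 {
		var dst [4]int32
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue // masked-off lane stays zero
			}
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		}
		return dst
	}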

func MaskzMaxEpi64

func MaskzMaxEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_maskz_max_epi64'. Requires AVX512F.

func MaskzMaxEpu32

func MaskzMaxEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm_maskz_max_epu32'. Requires AVX512F.

func MaskzMaxEpu64

func MaskzMaxEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_maskz_max_epu64'. Requires AVX512F.

func MaskzMaxPd

func MaskzMaxPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm_maskz_max_pd'. Requires AVX512F.
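
The MAX() in this pseudocode follows the instruction's ordered-compare semantics rather than Go's math.Max: when either operand is NaN, or both operands are zeros of either sign, the second source operand is returned. A scalar sketch (assumes import "math"):

	// vmax mimics the per-element MAX() of VMAXPD: effectively
	// 'a > b ? a : b', so b wins for NaN inputs and for ±0 vs ±0.
	func vmax(a, b float64) float64 {
		if math.IsNaN(a) || math.IsNaN(b) {
			return b // second source operand, unlike math.Max
		}
		if a > b {
			return a
		}
		return b
	}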

func MaskzMaxPs

func MaskzMaxPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm_maskz_max_ps'. Requires AVX512F.

func MaskzMaxRoundSd

func MaskzMaxRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskzMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MAX(a[63:0], b[63:0])
	ELSE
		dst[63:0] := 0
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_round_sd'. Requires AVX512F.

func MaskzMaxRoundSs

func MaskzMaxRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskzMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MAX(a[31:0], b[31:0])
	ELSE
		dst[31:0] := 0
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_round_ss'. Requires AVX512F.

func MaskzMaxSd

func MaskzMaxSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_sd'. Requires AVX512F.

func MaskzMaxSs

func MaskzMaxSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_ss'. Requires AVX512F.

func MaskzMinEpi32

func MaskzMinEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm_maskz_min_epi32'. Requires AVX512F.

func MaskzMinEpi64

func MaskzMinEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_maskz_min_epi64'. Requires AVX512F.

func MaskzMinEpu32

func MaskzMinEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm_maskz_min_epu32'. Requires AVX512F.

func MaskzMinEpu64

func MaskzMinEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_maskz_min_epu64'. Requires AVX512F.

func MaskzMinPd

func MaskzMinPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm_maskz_min_pd'. Requires AVX512F.

func MaskzMinPs

func MaskzMinPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm_maskz_min_ps'. Requires AVX512F.

func MaskzMinRoundSd

func MaskzMinRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskzMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MIN(a[63:0], b[63:0])
	ELSE
		dst[63:0] := 0
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_round_sd'. Requires AVX512F.

func MaskzMinRoundSs

func MaskzMinRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskzMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MIN(a[31:0], b[31:0])
	ELSE
		dst[31:0] := 0
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_round_ss'. Requires AVX512F.

func MaskzMinSd

func MaskzMinSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_sd'. Requires AVX512F.

func MaskzMinSs

func MaskzMinSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_ss'. Requires AVX512F.

func MaskzMovEpi32

func MaskzMovEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm_maskz_mov_epi32'. Requires AVX512F.
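
A scalar Go sketch of this masked move (illustrative only; the helper name is hypothetical) makes the zeromask behavior visible:

	// maskzMovEpi32 models _mm_maskz_mov_epi32: copy lane j from 'a' when
	// mask bit j is set, otherwise leave the lane zero.
	func maskzMovEpi32(k uint8, a [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j]
			}
		}
		return
	}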

func MaskzMovEpi64

func MaskzMovEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm_maskz_mov_epi64'. Requires AVX512F.

func MaskzMovPd

func MaskzMovPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm_maskz_mov_pd'. Requires AVX512F.

func MaskzMovPs

func MaskzMovPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm_maskz_mov_ps'. Requires AVX512F.

func MaskzMoveSd

func MaskzMoveSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_move_sd'. Requires AVX512F.

func MaskzMoveSs

func MaskzMoveSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_move_ss'. Requires AVX512F.

func MaskzMovedupPd

func MaskzMovedupPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm_maskz_movedup_pd'. Requires AVX512F.
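
The duplicate-then-mask order above can be mirrored in a scalar Go sketch (the helper name and the [2]float64 representation are assumed for illustration):

	// maskzMovedupPd models _mm_maskz_movedup_pd: broadcast the even-indexed
	// element a[0] to both lanes first, then apply the zeromask.
	func maskzMovedupPd(k uint8, a [2]float64) (dst [2]float64) {
		tmp := [2]float64{a[0], a[0]}
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}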

func MaskzMovehdupPs

func MaskzMovehdupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_maskz_movehdup_ps'. Requires AVX512F.

func MaskzMoveldupPs

func MaskzMoveldupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_maskz_moveldup_ps'. Requires AVX512F.

func MaskzMulEpi32

func MaskzMulEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm_maskz_mul_epi32'. Requires AVX512F.
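
The subtle point here is that only the low 32 bits of each 64-bit lane participate, sign-extended to a full 64-bit product. A scalar Go sketch (hypothetical helper; [2]int64 lanes assumed):

	// maskzMulEpi32 models _mm_maskz_mul_epi32: multiply the sign-extended
	// low 32 bits of each 64-bit lane into a 64-bit result.
	func maskzMulEpi32(k uint8, a, b [2]int64) (dst [2]int64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
			}
		}
		return
	}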

func MaskzMulEpu32

func MaskzMulEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm_maskz_mul_epu32'. Requires AVX512F.

func MaskzMulPd

func MaskzMulPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm_maskz_mul_pd'. Requires AVX512F.

func MaskzMulPs

func MaskzMulPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm_maskz_mul_ps'. Requires AVX512F.

func MaskzMulRoundSd

func MaskzMulRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] * b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_round_sd'. Requires AVX512F.

func MaskzMulRoundSs

func MaskzMulRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] * b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_round_ss'. Requires AVX512F.

func MaskzMulSd

func MaskzMulSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] * b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_sd'. Requires AVX512F.

func MaskzMulSs

func MaskzMulSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] * b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_ss'. Requires AVX512F.

func MaskzMulloEpi32

func MaskzMulloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm_maskz_mullo_epi32'. Requires AVX512F.
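
A scalar Go sketch of the widen-multiply-truncate sequence (helper name assumed):

	// maskzMulloEpi32 models _mm_maskz_mullo_epi32: form the 64-bit product
	// and keep only its low 32 bits in each active lane.
	func maskzMulloEpi32(k uint8, a, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				tmp := int64(a[j]) * int64(b[j])
				dst[j] = int32(tmp) // truncate to the low 32 bits
			}
		}
		return
	}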

func MaskzOrEpi32

func MaskzOrEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORD'. Intrinsic: '_mm_maskz_or_epi32'. Requires AVX512F.

func MaskzOrEpi64

func MaskzOrEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm_maskz_or_epi64'. Requires AVX512F.

func MaskzPermutePd

func MaskzPermutePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzPermutePs

func MaskzPermutePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzPermutevarPd

func MaskzPermutevarPd(k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)

MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permutevar_pd'. Requires AVX512F.

func MaskzPermutevarPs

func MaskzPermutevarPs(k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)

MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permutevar_ps'. Requires AVX512F.

func MaskzPermutex2varEpi32

func MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_maskz_permutex2var_epi32'. Requires AVX512F.
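
The selector layout (bits 1:0 of each index pick the element, bit 2 picks the source vector) is easy to miss in the pseudocode; a scalar Go sketch may help (helper name and lane representation assumed):

	// maskzPermutex2varEpi32 models _mm_maskz_permutex2var_epi32: for each
	// active lane, idx bits 1:0 select the element and idx bit 2 selects
	// between 'a' (0) and 'b' (1).
	func maskzPermutex2varEpi32(k uint8, a, idx, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays 0
			}
			off := idx[j] & 3
			if idx[j]&4 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}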

func MaskzPermutex2varEpi64

func MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_maskz_permutex2var_epi64'. Requires AVX512F.

func MaskzPermutex2varPd

func MaskzPermutex2varPd(k x86.Mmask8, a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)

MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_maskz_permutex2var_pd'. Requires AVX512F.

func MaskzPermutex2varPs

func MaskzPermutex2varPs(k x86.Mmask8, a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)

MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_maskz_permutex2var_ps'. Requires AVX512F.

func MaskzRcp14Pd

func MaskzRcp14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_maskz_rcp14_pd'. Requires AVX512F.
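
A scalar Go sketch of the masked reciprocal, using an exact division as a stand-in for the hardware approximation (which is only guaranteed to within 2^-14 relative error):

	// maskzRcp14Pd approximates _mm_maskz_rcp14_pd; an exact 1/x stands in
	// for the APPROXIMATE() step of the real instruction.
	func maskzRcp14Pd(k uint8, a [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = 1.0 / a[j]
			}
		}
		return
	}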

func MaskzRcp14Ps

func MaskzRcp14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_maskz_rcp14_ps'. Requires AVX512F.

func MaskzRcp14Sd

func MaskzRcp14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_maskz_rcp14_sd'. Requires AVX512F.

func MaskzRcp14Ss

func MaskzRcp14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_maskz_rcp14_ss'. Requires AVX512F.

func MaskzRolEpi32

func MaskzRolEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRolEpi64

func MaskzRolEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRolvEpi32

func MaskzRolvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_maskz_rolv_epi32'. Requires AVX512F.
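
In Go, math/bits.RotateLeft32 already implements LEFT_ROTATE_DWORDS (including the modulo-32 count reduction), so a sketch of the masked variable rotate is short (helper name assumed):

	import "math/bits"

	// maskzRolvEpi32 models _mm_maskz_rolv_epi32 using the standard library
	// rotate; the count is reduced modulo 32 as in LEFT_ROTATE_DWORDS.
	func maskzRolvEpi32(k uint8, a, b [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
			}
		}
		return
	}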

func MaskzRolvEpi64

func MaskzRolvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_maskz_rolv_epi64'. Requires AVX512F.

func MaskzRorEpi32

func MaskzRorEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRorEpi64

func MaskzRorEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRorvEpi32

func MaskzRorvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_maskz_rorv_epi32'. Requires AVX512F.

func MaskzRorvEpi64

func MaskzRorvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_maskz_rorv_epi64'. Requires AVX512F.

func MaskzRoundscalePd

func MaskzRoundscalePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
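
For the default control bits (imm8[2:0] == 0: round to nearest even, precision exceptions enabled), the round-to-M-fraction-bits operation reduces to scale, round, unscale. A sketch of just that path in Go (helper name assumed; SAE and the other rounding directions omitted):

	import "math"

	// maskzRoundscalePd sketches _mm_maskz_roundscale_pd for rounding
	// direction 0: round each active lane to M = imm8>>4 fraction bits.
	func maskzRoundscalePd(k uint8, a [2]float64, imm8 byte) (dst [2]float64) {
		scale := math.Ldexp(1, int(imm8>>4)) // 2^M
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.RoundToEven(a[j]*scale) / scale
			}
		}
		return
	}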

func MaskzRoundscalePs

func MaskzRoundscalePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleRoundSd

func MaskzRoundscaleRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleRoundSs

func MaskzRoundscaleRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleSd

func MaskzRoundscaleSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleSs

func MaskzRoundscaleSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRsqrt14Pd

func MaskzRsqrt14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_maskz_rsqrt14_pd'. Requires AVX512F.

func MaskzRsqrt14Ps

func MaskzRsqrt14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_maskz_rsqrt14_ps'. Requires AVX512F.

func MaskzRsqrt14Sd

func MaskzRsqrt14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_maskz_rsqrt14_sd'. Requires AVX512F.

func MaskzRsqrt14Ss

func MaskzRsqrt14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_maskz_rsqrt14_ss'. Requires AVX512F.

func MaskzScalefPd

func MaskzScalefPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_maskz_scalef_pd'. Requires AVX512F.
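
Setting aside the NaN, infinity and denormal branches of SCALE, the core operation is a[j] * 2^FLOOR(b[j]), which math.Ldexp expresses directly. A sketch of that finite path only (helper name assumed):

	import "math"

	// maskzScalefPd models the finite-input path of _mm_maskz_scalef_pd;
	// the special-case handling in SCALE above is deliberately omitted.
	func maskzScalefPd(k uint8, a, b [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
			}
		}
		return
	}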

func MaskzScalefPs

func MaskzScalefPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_maskz_scalef_ps'. Requires AVX512F.

func MaskzScalefRoundSd

func MaskzScalefRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := SCALE(a[63:0], b[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_round_sd'. Requires AVX512F.

func MaskzScalefRoundSs

func MaskzScalefRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := SCALE(a[31:0], b[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_round_ss'. Requires AVX512F.

func MaskzScalefSd

func MaskzScalefSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_sd'. Requires AVX512F.

func MaskzScalefSs

func MaskzScalefSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_ss'. Requires AVX512F.

func MaskzSet1Epi32

func MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M128i)

MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_set1_epi32'. Requires AVX512F.
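
A scalar Go sketch of the masked broadcast (helper name assumed):

	// maskzSet1Epi32 models _mm_maskz_set1_epi32: broadcast 'a' into every
	// active lane and zero the rest.
	func maskzSet1Epi32(k uint8, a int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a
			}
		}
		return
	}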

func MaskzSet1Epi64

func MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M128i)

MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_set1_epi64'. Requires AVX512F.

func MaskzShuffleEpi32

func MaskzShuffleEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
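
Each lane consumes two bits of 'imm8' (lane j uses imm8 bits 2j+1:2j), which a scalar Go sketch makes explicit (helper name assumed):

	// maskzShuffleEpi32 models _mm_maskz_shuffle_epi32: SELECT4 per lane,
	// then the zeromask.
	func maskzShuffleEpi32(k uint8, a [4]int32, imm8 byte) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			sel := (imm8 >> (2 * uint(j))) & 3 // two control bits per lane
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[sel]
			}
		}
		return
	}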

func MaskzShufflePd

func MaskzShufflePd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzShufflePs

func MaskzShufflePs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSllEpi32

func MaskzSllEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_sll_epi32'. Requires AVX512F.

func MaskzSllEpi64

func MaskzSllEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_sll_epi64'. Requires AVX512F.

func MaskzSlliEpi32

func MaskzSlliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
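
The imm8[7:0] > 31 branch above means oversized immediates zero the lane rather than being reduced modulo 32. A hypothetical scalar model:

	// maskzSlliEpi32 models _mm_maskz_slli_epi32; shift counts above 31
	// clear the lane, matching the pseudocode's explicit check.
	func maskzSlliEpi32(k uint8, a [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 && imm8 <= 31 {
				dst[j] = a[j] << imm8
			}
		}
		return dst
	}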

func MaskzSlliEpi64

func MaskzSlliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSllvEpi32

func MaskzSllvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm_maskz_sllv_epi32'. Requires AVX512F.

func MaskzSllvEpi64

func MaskzSllvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm_maskz_sllv_epi64'. Requires AVX512F.

func MaskzSqrtPd

func MaskzSqrtPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm_maskz_sqrt_pd'. Requires AVX512F.

func MaskzSqrtPs

func MaskzSqrtPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm_maskz_sqrt_ps'. Requires AVX512F.

func MaskzSqrtRoundSd

func MaskzSqrtRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := SQRT(a[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_round_sd'. Requires AVX512F.
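
The 'rounding' argument in these entries is an int built from the C-level _MM_FROUND_* flags. As a hedged illustration (constant values taken from the C headers; this package does not necessarily export equivalents), the combinations listed above can be written in Go as:

	// Hypothetical Go transcription of the C rounding-control flags.
	const (
		FroundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		FroundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		FroundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		FroundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		FroundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION (use MXCSR.RC)
		FroundNoExc        = 0x08 // _MM_FROUND_NO_EXC (suppress exceptions)
	)

	// "Truncate, and suppress exceptions", as in the list above.
	const truncateNoExc = FroundToZero | FroundNoExc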

func MaskzSqrtRoundSs

func MaskzSqrtRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := SQRT(a[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_round_ss'. Requires AVX512F.

func MaskzSqrtSd

func MaskzSqrtSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := SQRT(a[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_sd'. Requires AVX512F.

func MaskzSqrtSs

func MaskzSqrtSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := SQRT(a[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_ss'. Requires AVX512F.

func MaskzSraEpi32

func MaskzSraEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_sra_epi32'. Requires AVX512F.
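
Note that 'count' is read as one 64-bit amount applied to every lane, and counts above 31 fill the lane with copies of the sign bit. A hypothetical scalar model:

	// maskzSraEpi32 models _mm_maskz_sra_epi32; a shift by 31 replicates
	// the sign bit across the lane, matching the SignBit branch.
	func maskzSraEpi32(k uint8, a [4]int32, count uint64) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue // zeromask: lane stays 0
			}
			if count > 31 {
				dst[j] = a[j] >> 31 // all sign bits
			} else {
				dst[j] = a[j] >> count
			}
		}
		return dst
	}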

func MaskzSraEpi64

func MaskzSraEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_sra_epi64'. Requires AVX512F.

func MaskzSraiEpi32

func MaskzSraiEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSraiEpi64

func MaskzSraiEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSravEpi32

func MaskzSravEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm_maskz_srav_epi32'. Requires AVX512F.

func MaskzSravEpi64

func MaskzSravEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_maskz_srav_epi64'. Requires AVX512F.

func MaskzSrlEpi32

func MaskzSrlEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srl_epi32'. Requires AVX512F.

func MaskzSrlEpi64

func MaskzSrlEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srl_epi64'. Requires AVX512F.

func MaskzSrliEpi32

func MaskzSrliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSrliEpi64

func MaskzSrliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSrlvEpi32

func MaskzSrlvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm_maskz_srlv_epi32'. Requires AVX512F.
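
Unlike the single-count forms, each lane here has its own count, and oversized counts simply produce 0. Go's shift semantics already match that, so a hypothetical model needs no special case:

	// maskzSrlvEpi32 models _mm_maskz_srlv_epi32: a per-lane logical
	// right shift; Go defines uint32 shifts by 32 or more as 0.
	func maskzSrlvEpi32(k uint8, a, count [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] >> count[j]
			}
		}
		return dst
	}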

func MaskzSrlvEpi64

func MaskzSrlvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm_maskz_srlv_epi64'. Requires AVX512F.

func MaskzSubEpi32

func MaskzSubEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm_maskz_sub_epi32'. Requires AVX512F.

func MaskzSubEpi64

func MaskzSubEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm_maskz_sub_epi64'. Requires AVX512F.

func MaskzSubPd

func MaskzSubPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm_maskz_sub_pd'. Requires AVX512F.

func MaskzSubPs

func MaskzSubPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm_maskz_sub_ps'. Requires AVX512F.

func MaskzSubRoundSd

func MaskzSubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] - b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_round_sd'. Requires AVX512F.

func MaskzSubRoundSs

func MaskzSubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] - b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_round_ss'. Requires AVX512F.

func MaskzSubSd

func MaskzSubSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] - b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_sd'. Requires AVX512F.

func MaskzSubSs

func MaskzSubSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] - b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_ss'. Requires AVX512F.

func MaskzTernarylogicEpi32

func MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
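
The imm8 lookup is easier to see expanded into minterms. Below is a hedged pure-Go model (hypothetical helper; lanes as uint32):

	// maskzTernarylogicEpi32 models _mm_maskz_ternarylogic_epi32. For each
	// set bit idx of imm8, it ORs in the positions where the (a, b, c)
	// bits equal (idx[2], idx[1], idx[0]).
	func maskzTernarylogicEpi32(k uint8, a, b, c [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			var r uint32
			for idx := 0; idx < 8; idx++ {
				if imm8&(1<<idx) == 0 {
					continue // this minterm contributes nothing
				}
				m := uint32(0xFFFFFFFF)
				if idx&4 != 0 {
					m &= a[j]
				} else {
					m &^= a[j]
				}
				if idx&2 != 0 {
					m &= b[j]
				} else {
					m &^= b[j]
				}
				if idx&1 != 0 {
					m &= c[j]
				} else {
					m &^= c[j]
				}
				r |= m
			}
			dst[j] = r
		}
		return dst
	}

Two well-known control bytes: imm8 = 0x96 computes the three-way XOR a^b^c, and imm8 = 0xE8 computes the bitwise majority of 'a', 'b', and 'c'.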

func MaskzTernarylogicEpi64

func MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzUnpackhiEpi32

func MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_maskz_unpackhi_epi32'. Requires AVX512F.
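
A hypothetical scalar model of the interleave (lanes as uint32) makes the lane mapping explicit:

	// maskzUnpackhiEpi32 models _mm_maskz_unpackhi_epi32: interleave the
	// two high lanes of 'a' and 'b', then apply the zeromask.
	func maskzUnpackhiEpi32(k uint8, a, b [4]uint32) (dst [4]uint32) {
		tmp := [4]uint32{a[2], b[2], a[3], b[3]} // INTERLEAVE_HIGH_DWORDS
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return dst
	}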

func MaskzUnpackhiEpi64

func MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_maskz_unpackhi_epi64'. Requires AVX512F.

func MaskzUnpackhiPd

func MaskzUnpackhiPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_maskz_unpackhi_pd'. Requires AVX512F.

func MaskzUnpackhiPs

func MaskzUnpackhiPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_maskz_unpackhi_ps'. Requires AVX512F.

func MaskzUnpackloEpi32

func MaskzUnpackloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_maskz_unpacklo_epi32'. Requires AVX512F.

func MaskzUnpackloEpi64

func MaskzUnpackloEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_maskz_unpacklo_epi64'. Requires AVX512F.

func MaskzUnpackloPd

func MaskzUnpackloPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_maskz_unpacklo_pd'. Requires AVX512F.

func MaskzUnpackloPs

func MaskzUnpackloPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_maskz_unpacklo_ps'. Requires AVX512F.

func MaskzXorEpi32

func MaskzXorEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm_maskz_xor_epi32'. Requires AVX512F.

func MaskzXorEpi64

func MaskzXorEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm_maskz_xor_epi64'. Requires AVX512F.

func MaxEpi64

func MaxEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_max_epi64'. Requires AVX512F.

func MaxEpu64

func MaxEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_max_epu64'. Requires AVX512F.
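
MaxEpi64 and MaxEpu64 differ only in whether the lane comparison is signed, and the same bit pattern can win one comparison and lose the other. A hypothetical scalar illustration:

	// max64 contrasts the VPMAXSQ and VPMAXUQ lane comparisons: the
	// all-ones pattern is -1 as int64 but 2^64-1 as uint64.
	func max64(a, b uint64) (signed int64, unsigned uint64) {
		signed, unsigned = int64(b), b
		if int64(a) > int64(b) {
			signed = int64(a)
		}
		if a > b {
			unsigned = a
		}
		return
	}

For example, max64(0xFFFFFFFFFFFFFFFF, 1) returns (1, 0xFFFFFFFFFFFFFFFF).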

func MaxRoundSd

func MaxRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[63:0] := MAX(a[63:0], b[63:0])
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_max_round_sd'. Requires AVX512F.

func MaxRoundSs

func MaxRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[31:0] := MAX(a[31:0], b[31:0])
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_max_round_ss'. Requires AVX512F.

func MinEpi64

func MinEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_min_epi64'. Requires AVX512F.

func MinEpu64

func MinEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_min_epu64'. Requires AVX512F.

func MinRoundSd

func MinRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[63:0] := MIN(a[63:0], b[63:0])
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_min_round_sd'. Requires AVX512F.

func MinRoundSs

func MinRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)

MinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[31:0] := MIN(a[31:0], b[31:0])
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_min_round_ss'. Requires AVX512F.

func MulRoundSd

func MulRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] * b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mul_round_sd'. Requires AVX512F.

func MulRoundSs

func MulRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] * b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mul_round_ss'. Requires AVX512F.

func Permutex2varEpi32

func Permutex2varEpi32(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_permutex2var_epi32'. Requires AVX512F.
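
The index encoding packs a lane number and a source selector into each 'idx' element. A hedged pure-Go model (hypothetical helper):

	// permutex2varEpi32 models _mm_permutex2var_epi32: bits 1:0 of each
	// index pick a source lane, and bit 2 picks between 'a' and 'b'.
	func permutex2varEpi32(a, idx, b [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			off := idx[j] & 3  // idx[i+1:i]: source lane
			if idx[j]&4 != 0 { // idx[i+2]: 1 selects 'b'
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return dst
	}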

func Permutex2varEpi64

func Permutex2varEpi64(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_permutex2var_epi64'. Requires AVX512F.

func Permutex2varPd

func Permutex2varPd(a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)

Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_permutex2var_pd'. Requires AVX512F.

func Permutex2varPs

func Permutex2varPs(a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)

Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_permutex2var_ps'. Requires AVX512F.

func Rcp14Pd

func Rcp14Pd(a x86.M128d) (dst x86.M128d)

Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_rcp14_pd'. Requires AVX512F.

func Rcp14Ps

func Rcp14Ps(a x86.M128) (dst x86.M128)

Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_rcp14_ps'. Requires AVX512F.

func Rcp14Sd

func Rcp14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Rcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[63:0] := APPROXIMATE(1.0/b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_rcp14_sd'. Requires AVX512F.

func Rcp14Ss

func Rcp14Ss(a x86.M128, b x86.M128) (dst x86.M128)

Rcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[31:0] := APPROXIMATE(1.0/b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_rcp14_ss'. Requires AVX512F.

func RolEpi32

func RolEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)

RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
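
In plain Go the same rotation is available directly; math/bits reduces the count modulo 32 just like LEFT_ROTATE_DWORDS. A hypothetical model:

	import "math/bits"

	// rolEpi32 models _mm_rol_epi32 using the standard library.
	func rolEpi32(a [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			dst[j] = bits.RotateLeft32(a[j], int(imm8))
		}
		return dst
	}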

func RolEpi64

func RolEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RolvEpi32

func RolvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_rolv_epi32'. Requires AVX512F.

func RolvEpi64

func RolvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_rolv_epi64'. Requires AVX512F.

func RorEpi32

func RorEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)

RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RorEpi64

func RorEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RorvEpi32

func RorvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_rorv_epi32'. Requires AVX512F.

func RorvEpi64

func RorvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_rorv_epi64'. Requires AVX512F.

func RoundscalePd

func RoundscalePd(a x86.M128d, imm8 byte) (dst x86.M128d)

RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
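
Stripped of the MXCSR path (imm8[2]) and the precision-exception bookkeeping (imm8[3]), RoundTo_IntegerPD is "scale up by 2^M, round, scale back down". A hedged single-element model in Go:

	import "math"

	// roundscale models RoundTo_IntegerPD for one element: keep M
	// fraction bits, rounding in the direction chosen by imm8[1:0].
	func roundscale(src float64, imm8 byte) float64 {
		m := int(imm8 >> 4)     // imm8[7:4]: fraction bits to keep
		x := math.Ldexp(src, m) // 2^M * src
		var tmp float64
		switch imm8 & 3 { // imm8[1:0]: rounding direction
		case 0:
			tmp = math.RoundToEven(x)
		case 1:
			tmp = math.Floor(x)
		case 2:
			tmp = math.Ceil(x)
		case 3:
			tmp = math.Trunc(x)
		}
		return math.Ldexp(tmp, -m) // 2^-M * tmp
	}

For example, roundscale(1.3, 0x10) (M = 1, round to nearest) returns 1.5, the nearest multiple of 2^-1.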

func RoundscalePs

func RoundscalePs(a x86.M128, imm8 byte) (dst x86.M128)

RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleRoundSd

func RoundscaleRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

RoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleRoundSs

func RoundscaleRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

RoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleSd

func RoundscaleSd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

RoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleSs

func RoundscaleSs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

RoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func Rsqrt14Sd

func Rsqrt14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Rsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_rsqrt14_sd'. Requires AVX512F.

func Rsqrt14Ss

func Rsqrt14Ss(a x86.M128, b x86.M128) (dst x86.M128)

Rsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_rsqrt14_ss'. Requires AVX512F.

func ScalefPd

func ScalefPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_scalef_pd'. Requires AVX512F.
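
Ignoring the NaN/infinity/denormal arms above, SCALE reduces to multiplying by two raised to the floor of the second operand. A hypothetical one-element model:

	import "math"

	// scalef models the finite-input arm of SCALE:
	// src1 * POW(2, FLOOR(src2)).
	func scalef(src1, src2 float64) float64 {
		return src1 * math.Exp2(math.Floor(src2))
	}

For example, scalef(3, 2.7) returns 12 (3 * 2^2).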

func ScalefPs

func ScalefPs(a x86.M128, b x86.M128) (dst x86.M128)

ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_scalef_ps'. Requires AVX512F.

func ScalefRoundSd

func ScalefRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

ScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		dst[63:0] := SCALE(a[63:0], b[63:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_round_sd'. Requires AVX512F.
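
The *Round* functions here take 'rounding' as a plain int mirroring Intel's _MM_FROUND_* flags. This package does not export those constants, so a caller would restate the standard <immintrin.h> values; a sketch under that assumption:

	// Standard <immintrin.h> values, restated because this demo package
	// does not define them.
	const (
		MM_FROUND_TO_NEAREST_INT = 0x00
		MM_FROUND_TO_NEG_INF     = 0x01
		MM_FROUND_TO_POS_INF     = 0x02
		MM_FROUND_TO_ZERO        = 0x03
		MM_FROUND_CUR_DIRECTION  = 0x04
		MM_FROUND_NO_EXC         = 0x08
	)

	// Truncate and suppress exceptions:
	// dst := ScalefRoundSd(a, b, MM_FROUND_TO_ZERO|MM_FROUND_NO_EXC)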

func ScalefRoundSs

func ScalefRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

ScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		dst[31:0] := SCALE(a[31:0], b[31:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_round_ss'. Requires AVX512F.

func ScalefSd

func ScalefSd(a x86.M128d, b x86.M128d) (dst x86.M128d)

ScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_sd'. Requires AVX512F.

func ScalefSs

func ScalefSs(a x86.M128, b x86.M128) (dst x86.M128)

ScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_ss'. Requires AVX512F.

func SqrtRoundSd

func SqrtRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

SqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := SQRT(a[63:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_sqrt_round_sd'. Requires AVX512F.

func SqrtRoundSs

func SqrtRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

SqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := SQRT(a[31:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_sqrt_round_ss'. Requires AVX512F.

func SraEpi64

func SraEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_sra_epi64'. Requires AVX512F.
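
In Go, >> on a signed integer is already an arithmetic shift, so one lane of VPSRAQ can be modeled directly; the same helper also describes SraiEpi64 and, per element, SravEpi64 below. A sketch with illustrative names, not part of this package:

	package main

	import "fmt"

	// sraLane models one 64-bit lane: shift right, filling with the sign
	// bit, with counts above 63 saturating to all-sign-bits as in the
	// pseudocode above.
	func sraLane(a int64, count uint64) int64 {
		if count > 63 {
			count = 63 // a >> 63 is 0 or -1, i.e. the replicated sign bit
		}
		return a >> count
	}

	func main() {
		fmt.Println(sraLane(-8, 1))   // -4: sign bit shifted in
		fmt.Println(sraLane(-8, 100)) // -1: all sign bits
		fmt.Println(sraLane(8, 100))  //  0
	}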

func SraiEpi64

func SraiEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func SravEpi64

func SravEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_srav_epi64'. Requires AVX512F.

func SubRoundSd

func SubRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

SubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] - b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_sub_round_sd'. Requires AVX512F.

func SubRoundSs

func SubRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

SubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] - b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_sub_round_ss'. Requires AVX512F.

func TernarylogicEpi32

func TernarylogicEpi32(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

TernarylogicEpi32: Bitwise ternary logic that can implement any three-operand boolean function; the specific function is selected by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the value of 'imm8' at that index is written to the corresponding bit in 'dst'.

FOR j := 0 to 3
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
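
The immediate is simply the truth table of the desired three-input boolean function, indexed by a<<2 | b<<1 | c, and the same scheme applies to TernarylogicEpi64 below. A hedged pure-Go sketch of deriving an imm8 value and applying it to one 32-bit lane (helper names are illustrative):

	package main

	import "fmt"

	// ternlogImm derives the imm8 lookup table for a three-input boolean
	// function f, using the indexing from the pseudocode above:
	// index = a<<2 | b<<1 | c, and imm8[index] is the output bit.
	func ternlogImm(f func(a, b, c uint8) uint8) uint8 {
		var imm uint8
		for idx := uint8(0); idx < 8; idx++ {
			a, b, c := idx>>2&1, idx>>1&1, idx&1
			imm |= f(a, b, c) << idx
		}
		return imm
	}

	// ternlog32 models one 32-bit lane of VPTERNLOGD in pure Go.
	func ternlog32(a, b, c uint32, imm8 uint8) uint32 {
		var dst uint32
		for h := 0; h < 32; h++ {
			idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1
			dst |= uint32(imm8>>idx&1) << h
		}
		return dst
	}

	func main() {
		// (a AND b) OR c has the classic table value 0xEA.
		imm := ternlogImm(func(a, b, c uint8) uint8 { return a&b | c })
		fmt.Printf("imm8 = %#x\n", imm) // 0xea
		fmt.Printf("%#x\n", ternlog32(0xF0F0F0F0, 0xFF00FF00, 0x0F0F0F0F, imm)) // 0xff0fff0f
	}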

func TernarylogicEpi64

func TernarylogicEpi64(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

TernarylogicEpi64: Bitwise ternary logic that can implement any three-operand boolean function; the specific function is selected by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the value of 'imm8' at that index is written to the corresponding bit in 'dst'.

FOR j := 0 to 1
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func TestEpi32Mask

func TestEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm_test_epi32_mask'. Requires AVX512F.

func TestEpi64Mask

func TestEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 1
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm_test_epi64_mask'. Requires AVX512F.

func TestnEpi32Mask

func TestnEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 3
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm_testn_epi32_mask'. Requires AVX512F.

func TestnEpi64Mask

func TestnEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 1
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_testn_epi64_mask'. Requires AVX512F.
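
Test* and Testn* produce complementary masks over the populated lanes (for the 32-bit variants only mask bits 0-3 are ever set, per k[MAX:4] := 0). A minimal pure-Go model of the 32-bit pair, with plain arrays standing in for the vector registers:

	package main

	import "fmt"

	// testEpi32Mask sets mask bit j when a[j] AND b[j] is non-zero,
	// mirroring VPTESTMD over four lanes.
	func testEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := 0; j < 4; j++ {
			if a[j]&b[j] != 0 {
				k |= 1 << j
			}
		}
		return k
	}

	// testnEpi32Mask sets mask bit j when a[j] AND b[j] is zero,
	// mirroring VPTESTNMD; it is the lane-wise complement of the above.
	func testnEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := 0; j < 4; j++ {
			if a[j]&b[j] == 0 {
				k |= 1 << j
			}
		}
		return k
	}

	func main() {
		a := [4]uint32{1, 2, 0, 8}
		b := [4]uint32{1, 4, 0, 8}
		fmt.Printf("test:  %04b\n", testEpi32Mask(a, b))  // 1001
		fmt.Printf("testn: %04b\n", testnEpi32Mask(a, b)) // 0110
	}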

Types

This section is empty.
