Documentation ¶
Overview ¶
THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!
THEY DO NOT CONTAIN WORKING INTRINSICS!
Index ¶
- func CmpEpi16Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpEpi8Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)
- func CmpEpu16Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func CmpEpu8Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)
- func CmpeqEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpeqEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpeqEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpeqEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpgeEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpgeEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgeEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpgtEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpgtEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpgtEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpleEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpleEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpleEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpltEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpltEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpltEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpneqEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func CmpneqEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func CmpneqEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func Cvtepi16Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtsepi16Epi8(a x86.M128i) (dst x86.M128i)
- func Cvtusepi16Epi8(a x86.M128i) (dst x86.M128i)
- func DbsadEpu8(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
- func M256CmpEpi16Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)
- func M256CmpEpi8Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)
- func M256CmpEpu16Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)
- func M256CmpEpu8Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)
- func M256CmpeqEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpeqEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpeqEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpeqEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpgeEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpgeEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpgeEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpgeEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpgtEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpgtEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpgtEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpgtEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpleEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpleEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpleEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpleEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpltEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpltEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpltEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpltEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpneqEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpneqEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256CmpneqEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256CmpneqEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256Cvtepi16Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtsepi16Epi8(a x86.M256i) (dst x86.M128i)
- func M256Cvtusepi16Epi8(a x86.M256i) (dst x86.M128i)
- func M256DbsadEpu8(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256Mask2Permutex2varEpi16(a x86.M256i, idx x86.M256i, k x86.Mmask16, b x86.M256i) (dst x86.M256i)
- func M256MaskAbsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i) (dst x86.M256i)
- func M256MaskAbsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i) (dst x86.M256i)
- func M256MaskAddEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddsEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAddsEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAlignrEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i, count int) (dst x86.M256i)
- func M256MaskAvgEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskAvgEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBlendEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBlendEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskBroadcastbEpi8(src x86.M256i, k x86.Mmask32, a x86.M128i) (dst x86.M256i)
- func M256MaskBroadcastwEpi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskCmpEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)
- func M256MaskCmpEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)
- func M256MaskCmpEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)
- func M256MaskCmpEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)
- func M256MaskCmpeqEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpeqEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpeqEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpeqEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpgeEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpgeEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpgeEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpgeEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpgtEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpgtEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpgtEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpgtEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpleEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpleEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpleEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpleEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpltEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpltEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpltEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpltEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpneqEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpneqEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCmpneqEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskCmpneqEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskCvtepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtepi8Epi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtepu8Epi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskCvtsepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskCvtusepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskDbsadEpu8(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskMaddEpi16(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaddubsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMaxEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMinEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMovEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i) (dst x86.M256i)
- func M256MaskMovEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i) (dst x86.M256i)
- func M256MaskMulhiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMulhiEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMulhrsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskMulloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPacksEpi16(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPacksEpi32(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPackusEpi16(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPackusEpi32(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPermutex2varEpi16(a x86.M256i, k x86.Mmask16, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskPermutexvarEpi16(src x86.M256i, k x86.Mmask16, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskSet1Epi16(src x86.M256i, k x86.Mmask16, a int16) (dst x86.M256i)
- func M256MaskSet1Epi8(src x86.M256i, k x86.Mmask32, a byte) (dst x86.M256i)
- func M256MaskShuffleEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskShufflehiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskShuffleloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSllEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSlliEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSllvEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSraEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSraiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSravEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSrlEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskSrliEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskSrlvEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskSubEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubsEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskSubsEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskTestEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskTestEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskTestnEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256MaskTestnEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256MaskUnpackhiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackhiEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskUnpackloEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAbsEpi16(k x86.Mmask16, a x86.M256i) (dst x86.M256i)
- func M256MaskzAbsEpi8(k x86.Mmask32, a x86.M256i) (dst x86.M256i)
- func M256MaskzAddEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddsEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddsEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAddsEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAlignrEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i, count int) (dst x86.M256i)
- func M256MaskzAvgEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzAvgEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzBroadcastbEpi8(k x86.Mmask32, a x86.M128i) (dst x86.M256i)
- func M256MaskzBroadcastwEpi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtepi8Epi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtepu8Epi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)
- func M256MaskzCvtsepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskzCvtusepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)
- func M256MaskzDbsadEpu8(k x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzMaddEpi16(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaddubsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMaxEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMinEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMovEpi16(k x86.Mmask16, a x86.M256i) (dst x86.M256i)
- func M256MaskzMovEpi8(k x86.Mmask32, a x86.M256i) (dst x86.M256i)
- func M256MaskzMulhiEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMulhiEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMulhrsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzMulloEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPacksEpi16(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPacksEpi32(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPackusEpi16(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPackusEpi32(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPermutex2varEpi16(k x86.Mmask16, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzPermutexvarEpi16(k x86.Mmask16, idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256MaskzSet1Epi16(k x86.Mmask16, a int16) (dst x86.M256i)
- func M256MaskzSet1Epi8(k x86.Mmask32, a byte) (dst x86.M256i)
- func M256MaskzShuffleEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzShufflehiEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzShuffleloEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSllEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSlliEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSllvEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSraEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSraiEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSravEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSrlEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)
- func M256MaskzSrliEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)
- func M256MaskzSrlvEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256MaskzSubEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubsEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubsEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzSubsEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackhiEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackhiEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackloEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256MaskzUnpackloEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256Movepi16Mask(a x86.M256i) (dst x86.Mmask16)
- func M256Movepi8Mask(a x86.M256i) (dst x86.Mmask32)
- func M256MovmEpi16(k x86.Mmask16) (dst x86.M256i)
- func M256MovmEpi8(k x86.Mmask32) (dst x86.M256i)
- func M256Permutex2varEpi16(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
- func M256PermutexvarEpi16(idx x86.M256i, a x86.M256i) (dst x86.M256i)
- func M256SllvEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256SravEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256SrlvEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)
- func M256TestEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256TestEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M256TestnEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)
- func M256TestnEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)
- func M512AbsEpi16(a x86.M512i) (dst x86.M512i)
- func M512AbsEpi8(a x86.M512i) (dst x86.M512i)
- func M512AddEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AddEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AddsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AddsEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AddsEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AddsEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AlignrEpi8(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512AvgEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512AvgEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512BroadcastbEpi8(a x86.M128i) (dst x86.M512i)
- func M512BroadcastwEpi16(a x86.M128i) (dst x86.M512i)
- func M512BslliEpi128(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512BsrliEpi128(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512CmpEpi16Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)
- func M512CmpEpi8Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)
- func M512CmpEpu16Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)
- func M512CmpEpu8Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)
- func M512CmpeqEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpeqEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpeqEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpeqEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpgeEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpgeEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpgeEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpgeEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpgtEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpgtEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpgtEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpgtEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpleEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpleEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpleEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpleEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpltEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpltEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpltEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpltEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpneqEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpneqEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512CmpneqEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512CmpneqEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512Cvtepi16Epi8(a x86.M512i) (dst x86.M256i)
- func M512Cvtepi8Epi16(a x86.M256i) (dst x86.M512i)
- func M512Cvtepu8Epi16(a x86.M256i) (dst x86.M512i)
- func M512Cvtsepi16Epi8(a x86.M512i) (dst x86.M256i)
- func M512Cvtusepi16Epi8(a x86.M512i) (dst x86.M256i)
- func M512DbsadEpu8(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512Kunpackd(a x86.Mmask64, b x86.Mmask64) (dst x86.Mmask64)
- func M512Kunpackw(a x86.Mmask32, b x86.Mmask32) (dst x86.Mmask32)
- func M512MaddEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaddubsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512Mask2Permutex2varEpi16(a x86.M512i, idx x86.M512i, k x86.Mmask32, b x86.M512i) (dst x86.M512i)
- func M512MaskAbsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i) (dst x86.M512i)
- func M512MaskAbsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i) (dst x86.M512i)
- func M512MaskAddEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAddEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAddsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAddsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAddsEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAddsEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAlignrEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512MaskAvgEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskAvgEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskBlendEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskBlendEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskBroadcastbEpi8(src x86.M512i, k x86.Mmask64, a x86.M128i) (dst x86.M512i)
- func M512MaskBroadcastwEpi16(src x86.M512i, k x86.Mmask32, a x86.M128i) (dst x86.M512i)
- func M512MaskCmpEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)
- func M512MaskCmpEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)
- func M512MaskCmpEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)
- func M512MaskCmpEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)
- func M512MaskCmpeqEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpeqEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpeqEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpeqEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpgeEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpgeEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpgeEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpgeEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpgtEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpgtEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpgtEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpgtEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpleEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpleEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpleEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpleEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpltEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpltEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpltEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpltEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpneqEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpneqEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCmpneqEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskCmpneqEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskCvtepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtepi8Epi16(src x86.M512i, k x86.Mmask32, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtepu8Epi16(src x86.M512i, k x86.Mmask32, a x86.M256i) (dst x86.M512i)
- func M512MaskCvtsepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskCvtusepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskDbsadEpu8(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskMaddEpi16(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaddubsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMaxEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMinEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMovEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i) (dst x86.M512i)
- func M512MaskMovEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i) (dst x86.M512i)
- func M512MaskMulhiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMulhiEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMulhrsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskMulloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPacksEpi16(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPacksEpi32(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPackusEpi16(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPackusEpi32(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPermutex2varEpi16(a x86.M512i, k x86.Mmask32, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskPermutexvarEpi16(src x86.M512i, k x86.Mmask32, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskSet1Epi16(src x86.M512i, k x86.Mmask32, a int16) (dst x86.M512i)
- func M512MaskSet1Epi8(src x86.M512i, k x86.Mmask64, a byte) (dst x86.M512i)
- func M512MaskShuffleEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskShufflehiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskShuffleloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSllEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSlliEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSllvEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSraEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSraiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSravEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSrlEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskSrliEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskSrlvEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskSubEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSubEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSubsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSubsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSubsEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskSubsEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskTestEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskTestEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskTestnEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512MaskTestnEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512MaskUnpackhiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackhiEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskUnpackloEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAbsEpi16(k x86.Mmask32, a x86.M512i) (dst x86.M512i)
- func M512MaskzAbsEpi8(k x86.Mmask64, a x86.M512i) (dst x86.M512i)
- func M512MaskzAddEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddsEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddsEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAddsEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAlignrEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
- func M512MaskzAvgEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzAvgEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzBroadcastbEpi8(k x86.Mmask64, a x86.M128i) (dst x86.M512i)
- func M512MaskzBroadcastwEpi16(k x86.Mmask32, a x86.M128i) (dst x86.M512i)
- func M512MaskzCvtepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtepi8Epi16(k x86.Mmask32, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtepu8Epi16(k x86.Mmask32, a x86.M256i) (dst x86.M512i)
- func M512MaskzCvtsepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskzCvtusepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)
- func M512MaskzDbsadEpu8(k x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzMaddEpi16(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaddubsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMaxEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMinEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMovEpi16(k x86.Mmask32, a x86.M512i) (dst x86.M512i)
- func M512MaskzMovEpi8(k x86.Mmask64, a x86.M512i) (dst x86.M512i)
- func M512MaskzMulhiEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMulhiEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMulhrsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzMulloEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPacksEpi16(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPacksEpi32(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPackusEpi16(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPackusEpi32(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPermutex2varEpi16(k x86.Mmask32, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzPermutexvarEpi16(k x86.Mmask32, idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512MaskzSet1Epi16(k x86.Mmask32, a int16) (dst x86.M512i)
- func M512MaskzSet1Epi8(k x86.Mmask64, a byte) (dst x86.M512i)
- func M512MaskzShuffleEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzShufflehiEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzShuffleloEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSllEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSlliEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSllvEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSraEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSraiEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSravEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSrlEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512MaskzSrliEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512MaskzSrlvEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512MaskzSubEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubsEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubsEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzSubsEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackhiEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackhiEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackloEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaskzUnpackloEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MaxEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MinEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512Movepi16Mask(a x86.M512i) (dst x86.Mmask32)
- func M512Movepi8Mask(a x86.M512i) (dst x86.Mmask64)
- func M512MovmEpi16(k x86.Mmask32) (dst x86.M512i)
- func M512MovmEpi8(k x86.Mmask64) (dst x86.M512i)
- func M512MulhiEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MulhiEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MulhrsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512MulloEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512PacksEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512PacksEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512PackusEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512PackusEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512Permutex2varEpi16(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512PermutexvarEpi16(idx x86.M512i, a x86.M512i) (dst x86.M512i)
- func M512SadEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512ShuffleEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512ShufflehiEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512ShuffleloEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SllEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SlliEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SllvEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SraEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SraiEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SravEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SrlEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)
- func M512SrliEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)
- func M512SrlvEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)
- func M512SubEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SubEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SubsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SubsEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SubsEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512SubsEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512TestEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512TestEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512TestnEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)
- func M512TestnEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)
- func M512UnpackhiEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackhiEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackloEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func M512UnpackloEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)
- func Mask2Permutex2varEpi16(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)
- func MaskAbsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskAbsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskAddEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddsEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAddsEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAlignrEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i, count int) (dst x86.M128i)
- func MaskAvgEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskAvgEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBlendEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBlendEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskBroadcastbEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskBroadcastwEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCmpEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)
- func MaskCmpEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)
- func MaskCmpEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)
- func MaskCmpeqEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpeqEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpeqEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpeqEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpgeEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpgeEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgeEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpgtEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpgtEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpgtEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpleEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpleEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpleEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpltEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpltEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpltEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpneqEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCmpneqEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskCmpneqEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskCvtepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepi8Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtepu8Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtsepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskCvtusepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskDbsadEpu8(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskMaddEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaddubsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMaxEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMinEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMovEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskMovEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskMulhiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMulhiEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMulhrsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskMulloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPacksEpi16(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPacksEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPackusEpi16(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPackusEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPermutex2varEpi16(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskPermutexvarEpi16(src x86.M128i, k x86.Mmask8, idx x86.M128i, a x86.M128i) (dst x86.M128i)
- func MaskSet1Epi16(src x86.M128i, k x86.Mmask8, a int16) (dst x86.M128i)
- func MaskSet1Epi8(src x86.M128i, k x86.Mmask16, a byte) (dst x86.M128i)
- func MaskShuffleEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskShufflehiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskShuffleloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSllEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSlliEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSllvEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSraEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSraiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSravEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrlEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSrliEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskSrlvEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskSubEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubsEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskSubsEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskTestEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskTestEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskTestnEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func MaskTestnEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func MaskUnpackhiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackhiEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskUnpackloEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAbsEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzAbsEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskzAddEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddsEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddsEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAddsEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAlignrEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i, count int) (dst x86.M128i)
- func MaskzAvgEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzAvgEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzBroadcastbEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskzBroadcastwEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepi8Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtepu8Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtsepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzCvtusepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzDbsadEpu8(k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzMaddEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaddubsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMaxEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMinEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMovEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)
- func MaskzMovEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)
- func MaskzMulhiEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMulhiEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMulhrsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzMulloEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPacksEpi16(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPacksEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPackusEpi16(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPackusEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPermutex2varEpi16(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzPermutexvarEpi16(k x86.Mmask8, idx x86.M128i, a x86.M128i) (dst x86.M128i)
- func MaskzSet1Epi16(k x86.Mmask8, a int16) (dst x86.M128i)
- func MaskzSet1Epi8(k x86.Mmask16, a byte) (dst x86.M128i)
- func MaskzShuffleEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzShufflehiEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzShuffleloEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSllEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSlliEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSllvEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSraEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSraiEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSravEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrlEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSrliEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)
- func MaskzSrlvEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)
- func MaskzSubEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubsEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubsEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzSubsEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackhiEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackhiEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackloEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func MaskzUnpackloEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)
- func Movepi16Mask(a x86.M128i) (dst x86.Mmask8)
- func Movepi8Mask(a x86.M128i) (dst x86.Mmask16)
- func MovmEpi16(k x86.Mmask8) (dst x86.M128i)
- func Permutex2varEpi16(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)
- func PermutexvarEpi16(idx x86.M128i, a x86.M128i) (dst x86.M128i)
- func SllvEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)
- func SravEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)
- func SrlvEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)
- func TestEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func TestEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
- func TestnEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)
- func TestnEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CmpEpi16Mask ¶
CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
	CASE (imm8[7:0]) OF
	0: OP := _MM_CMPINT_EQ
	1: OP := _MM_CMPINT_LT
	2: OP := _MM_CMPINT_LE
	3: OP := _MM_CMPINT_FALSE
	4: OP := _MM_CMPINT_NEQ
	5: OP := _MM_CMPINT_NLT
	6: OP := _MM_CMPINT_NLE
	7: OP := _MM_CMPINT_TRUE
	ESAC
	FOR j := 0 to 7
		i := j*16
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ENDFOR
	k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
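For readers unfamiliar with the mask-register model, the following sketch restates the pseudocode above in plain Go, operating on ordinary arrays instead of the package's x86.M128i and x86.Mmask8 types. The cmpEpi16Mask helper is purely illustrative and is not part of this package; the real intrinsic performs all eight comparisons in a single VPCMPW instruction.

	package main

	import "fmt"

	// cmpEpi16Mask models the pseudocode above in plain Go: eight signed
	// 16-bit lanes are compared using the predicate selected by imm8[2:0],
	// and each result becomes one bit of an 8-bit mask (k[MAX:8] is
	// implicitly zero because the result type is uint8).
	func cmpEpi16Mask(a, b [8]int16, imm8 byte) uint8 {
		var k uint8
		for j := 0; j < 8; j++ {
			var hit bool
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				hit = a[j] == b[j]
			case 1: // _MM_CMPINT_LT
				hit = a[j] < b[j]
			case 2: // _MM_CMPINT_LE
				hit = a[j] <= b[j]
			case 3: // _MM_CMPINT_FALSE
				hit = false
			case 4: // _MM_CMPINT_NEQ
				hit = a[j] != b[j]
			case 5: // _MM_CMPINT_NLT
				hit = a[j] >= b[j]
			case 6: // _MM_CMPINT_NLE
				hit = a[j] > b[j]
			case 7: // _MM_CMPINT_TRUE
				hit = true
			}
			if hit {
				k |= 1 << j
			}
		}
		return k
	}

	func main() {
		a := [8]int16{1, 2, 3, 4, 5, 6, 7, 8}
		b := [8]int16{8, 7, 6, 5, 4, 3, 2, 1}
		// Predicate 1 (_MM_CMPINT_LT): bits 0-3 are set, since only the
		// first four lanes of a are less than the matching lanes of b.
		fmt.Printf("%08b\n", cmpEpi16Mask(a, b, 1)) // prints 00001111
	}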
func CmpEpi8Mask ¶
CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
	CASE (imm8[7:0]) OF
	0: OP := _MM_CMPINT_EQ
	1: OP := _MM_CMPINT_LT
	2: OP := _MM_CMPINT_LE
	3: OP := _MM_CMPINT_FALSE
	4: OP := _MM_CMPINT_NEQ
	5: OP := _MM_CMPINT_NLT
	6: OP := _MM_CMPINT_NLE
	7: OP := _MM_CMPINT_TRUE
	ESAC
	FOR j := 0 to 15
		i := j*8
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ENDFOR
	k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func CmpEpu16Mask ¶
CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func CmpEpu8Mask ¶
CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func CmpeqEpi16Mask ¶
CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpeq_epi16_mask'. Requires AVX512BW.
func CmpeqEpi8Mask ¶
CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpeq_epi8_mask'. Requires AVX512BW.
func CmpeqEpu16Mask ¶
CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpeq_epu16_mask'. Requires AVX512BW.
func CmpeqEpu8Mask ¶
CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpeq_epu8_mask'. Requires AVX512BW.
func CmpgeEpi16Mask ¶
CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpge_epi16_mask'. Requires AVX512BW.
func CmpgeEpi8Mask ¶
CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpge_epi8_mask'. Requires AVX512BW.
func CmpgeEpu16Mask ¶
CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpge_epu16_mask'. Requires AVX512BW.
func CmpgeEpu8Mask ¶
CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpge_epu8_mask'. Requires AVX512BW.
func CmpgtEpi16Mask ¶
CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpgt_epi16_mask'. Requires AVX512BW.
func CmpgtEpi8Mask ¶
CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpgt_epi8_mask'. Requires AVX512BW.
func CmpgtEpu16Mask ¶
CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpgt_epu16_mask'. Requires AVX512BW.
func CmpgtEpu8Mask ¶
CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpgt_epu8_mask'. Requires AVX512BW.
func CmpleEpi16Mask ¶
CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmple_epi16_mask'. Requires AVX512BW.
func CmpleEpi8Mask ¶
CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmple_epi8_mask'. Requires AVX512BW.
func CmpleEpu16Mask ¶
CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmple_epu16_mask'. Requires AVX512BW.
func CmpleEpu8Mask ¶
CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmple_epu8_mask'. Requires AVX512BW.
func CmpltEpi16Mask ¶
CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmplt_epi16_mask'. Requires AVX512BW.
func CmpltEpi8Mask ¶
CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmplt_epi8_mask'. Requires AVX512BW.
func CmpltEpu16Mask ¶
CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmplt_epu16_mask'. Requires AVX512BW.
func CmpltEpu8Mask ¶
CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmplt_epu8_mask'. Requires AVX512BW.
func CmpneqEpi16Mask ¶
CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpneq_epi16_mask'. Requires AVX512BW.
func CmpneqEpi8Mask ¶
CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpneq_epi8_mask'. Requires AVX512BW.
func CmpneqEpu16Mask ¶
CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 7 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpneq_epu16_mask'. Requires AVX512BW.
func CmpneqEpu8Mask ¶
CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpneq_epu8_mask'. Requires AVX512BW.
func Cvtepi16Epi8 ¶
Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm_cvtepi16_epi8'. Requires AVX512BW.
func Cvtsepi16Epi8 ¶
Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm_cvtsepi16_epi8'. Requires AVX512BW.
func Cvtusepi16Epi8 ¶
Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 7 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_cvtusepi16_epi8'. Requires AVX512BW.
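The three 16-to-8-bit narrowings above differ only in how out-of-range values are handled. A one-lane scalar sketch of the difference (helper names are illustrative):

    package main

    import "fmt"

    func truncate8(v int16) int8 { return int8(v) } // keep the low 8 bits

    func saturate8(v int16) int8 { // clamp to the int8 range
        if v > 127 {
            return 127
        }
        if v < -128 {
            return -128
        }
        return int8(v)
    }

    func saturateU8(v uint16) uint8 { // clamp to the uint8 range
        if v > 255 {
            return 255
        }
        return uint8(v)
    }

    func main() {
        fmt.Println(truncate8(300), saturate8(300), saturateU8(300)) // 44 127 255
    }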
func DbsadEpu8 ¶
DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])
FOR j := 0 to 1
  i := j*64
  dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
  dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
  dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
  dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:128] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
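A scalar model of the low 64-bit lane may make this concrete: 'imm8' picks four 32-bit quadruplets out of 'b', and each of the four 16-bit results is a SAD of a 4-byte window of 'a' against a 4-byte window of the selected bytes. All names here are illustrative:

    package main

    import "fmt"

    func absDiff(x, y byte) uint16 {
        if x > y {
            return uint16(x - y)
        }
        return uint16(y - x)
    }

    // sad4 sums |a[i]-b[i]| over a 4-byte window.
    func sad4(a, b []byte) (s uint16) {
        for i := 0; i < 4; i++ {
            s += absDiff(a[i], b[i])
        }
        return
    }

    func main() {
        a := make([]byte, 16)
        b := make([]byte, 16)
        for i := range a {
            a[i], b[i] = byte(i), byte(2*i)
        }
        imm8 := byte(0xE4) // 11 10 01 00: identity selection of b's quadruplets
        var tmp []byte
        for f := 0; f < 4; f++ {
            q := int(imm8>>(2*uint(f))) & 3
            tmp = append(tmp, b[4*q:4*q+4]...)
        }
        // The four SADs of the low lane, at 8-bit offsets into tmp,
        // exactly as in the pseudocode above.
        fmt.Println(sad4(a[0:4], tmp[0:4]), sad4(a[0:4], tmp[1:5]),
            sad4(a[4:8], tmp[2:6]), sad4(a[4:8], tmp[3:7]))
    }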
func M256CmpEpi16Mask ¶
M256CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256CmpEpi8Mask ¶
M256CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256CmpEpu16Mask ¶
M256CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256CmpEpu8Mask ¶
M256CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256CmpeqEpi16Mask ¶
M256CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpeq_epi16_mask'. Requires AVX512BW.
func M256CmpeqEpi8Mask ¶
M256CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpeq_epi8_mask'. Requires AVX512BW.
func M256CmpeqEpu16Mask ¶
M256CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpeq_epu16_mask'. Requires AVX512BW.
func M256CmpeqEpu8Mask ¶
M256CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpeq_epu8_mask'. Requires AVX512BW.
func M256CmpgeEpi16Mask ¶
M256CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpge_epi16_mask'. Requires AVX512BW.
func M256CmpgeEpi8Mask ¶
M256CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpge_epi8_mask'. Requires AVX512BW.
func M256CmpgeEpu16Mask ¶
M256CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpge_epu16_mask'. Requires AVX512BW.
func M256CmpgeEpu8Mask ¶
M256CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpge_epu8_mask'. Requires AVX512BW.
func M256CmpgtEpi16Mask ¶
M256CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpgt_epi16_mask'. Requires AVX512BW.
func M256CmpgtEpi8Mask ¶
M256CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpgt_epi8_mask'. Requires AVX512BW.
func M256CmpgtEpu16Mask ¶
M256CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpgt_epu16_mask'. Requires AVX512BW.
func M256CmpgtEpu8Mask ¶
M256CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpgt_epu8_mask'. Requires AVX512BW.
func M256CmpleEpi16Mask ¶
M256CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmple_epi16_mask'. Requires AVX512BW.
func M256CmpleEpi8Mask ¶
M256CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmple_epi8_mask'. Requires AVX512BW.
func M256CmpleEpu16Mask ¶
M256CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmple_epu16_mask'. Requires AVX512BW.
func M256CmpleEpu8Mask ¶
M256CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmple_epu8_mask'. Requires AVX512BW.
func M256CmpltEpi16Mask ¶
M256CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmplt_epi16_mask'. Requires AVX512BW.
func M256CmpltEpi8Mask ¶
M256CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmplt_epi8_mask'. Requires AVX512BW.
func M256CmpltEpu16Mask ¶
M256CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmplt_epu16_mask'. Requires AVX512BW.
func M256CmpltEpu8Mask ¶
M256CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmplt_epu8_mask'. Requires AVX512BW.
func M256CmpneqEpi16Mask ¶
M256CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpneq_epi16_mask'. Requires AVX512BW.
func M256CmpneqEpi8Mask ¶
M256CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpneq_epi8_mask'. Requires AVX512BW.
func M256CmpneqEpu16Mask ¶
M256CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 15 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpneq_epu16_mask'. Requires AVX512BW.
func M256CmpneqEpu8Mask ¶
M256CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpneq_epu8_mask'. Requires AVX512BW.
func M256Cvtepi16Epi8 ¶
M256Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm256_cvtepi16_epi8'. Requires AVX512BW.
func M256Cvtsepi16Epi8 ¶
M256Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_cvtsepi16_epi8'. Requires AVX512BW.
func M256Cvtusepi16Epi8 ¶
M256Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 15 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_cvtusepi16_epi8'. Requires AVX512BW.
func M256DbsadEpu8 ¶
M256DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 1
  i := j*128
  tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
  tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
  tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
  tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
  i := j*64
  dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
  dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
  dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
  dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:256] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256Mask2Permutex2varEpi16 ¶
func M256Mask2Permutex2varEpi16(a x86.M256i, idx x86.M256i, k x86.Mmask16, b x86.M256i) (dst x86.M256i)
M256Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2W'. Intrinsic: '_mm256_mask2_permutex2var_epi16'. Requires AVX512BW.
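A scalar model of the selection logic above (illustrative, not the intrinsic): the low four bits of each 'idx' element select a lane, bit 4 selects between 'a' and 'b', and masked-off lanes fall back to the 'idx' value.

    package main

    import "fmt"

    func permutex2var16(a, idx, b [16]uint16, k uint16) (dst [16]uint16) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                off := idx[j] & 0xF // lane selector
                if idx[j]&0x10 != 0 {
                    dst[j] = b[off] // bit 4 set: read from b
                } else {
                    dst[j] = a[off]
                }
            } else {
                dst[j] = idx[j] // mask2 variant: copy from idx
            }
        }
        return
    }

    func main() {
        var a, b, idx [16]uint16
        for i := range a {
            a[i], b[i] = uint16(i), uint16(100+i)
            idx[i] = uint16(15 - i) // reverse a; OR in 0x10 to read b instead
        }
        fmt.Println(permutex2var16(a, idx, b, 0xFFFF))
    }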
func M256MaskAbsEpi16 ¶
M256MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm256_mask_abs_epi16'. Requires AVX512BW.
func M256MaskAbsEpi8 ¶
M256MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm256_mask_abs_epi8'. Requires AVX512BW.
func M256MaskAddEpi16 ¶
M256MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm256_mask_add_epi16'. Requires AVX512BW.
func M256MaskAddEpi8 ¶
M256MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm256_mask_add_epi8'. Requires AVX512BW.
func M256MaskAddsEpi16 ¶
M256MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm256_mask_adds_epi16'. Requires AVX512BW.
func M256MaskAddsEpi8 ¶
M256MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm256_mask_adds_epi8'. Requires AVX512BW.
func M256MaskAddsEpu16 ¶
M256MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm256_mask_adds_epu16'. Requires AVX512BW.
func M256MaskAddsEpu8 ¶
M256MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm256_mask_adds_epu8'. Requires AVX512BW.
func M256MaskAlignrEpi8 ¶
func M256MaskAlignrEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i, count int) (dst x86.M256i)
M256MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm256_mask_alignr_epi8'. Requires AVX512BW.
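A scalar sketch of one 128-bit lane of the concatenate-and-shift above (the helper name is illustrative):

    package main

    import "fmt"

    // alignr16 concatenates a above b, shifts right by count bytes, and
    // keeps the low 16 bytes; bytes shifted in from beyond the 32-byte
    // concatenation are zero.
    func alignr16(a, b [16]byte, count int) (dst [16]byte) {
        var tmp [32]byte
        copy(tmp[:16], b[:]) // low 128 bits
        copy(tmp[16:], a[:]) // high 128 bits
        for i := 0; i < 16; i++ {
            if count+i < 32 {
                dst[i] = tmp[count+i]
            }
        }
        return
    }

    func main() {
        var a, b [16]byte
        for i := range a {
            a[i], b[i] = byte(0xA0+i), byte(i)
        }
        fmt.Printf("% x\n", alignr16(a, b, 4)) // b[4..15] then a[0..3]
    }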
func M256MaskAvgEpu16 ¶
M256MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm256_mask_avg_epu16'. Requires AVX512BW.
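The (a + b + 1) >> 1 form above is a rounding average; widening before the add is what keeps it from overflowing, as this one-lane sketch shows. The same formula drives the 8-bit variant below.

    package main

    import "fmt"

    func avgU16(a, b uint16) uint16 {
        return uint16((uint32(a) + uint32(b) + 1) >> 1) // widen, add, round up
    }

    func main() {
        fmt.Println(avgU16(65535, 65535)) // 65535: no wraparound
        fmt.Println(avgU16(1, 2))         // 2: ties round up
    }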
func M256MaskAvgEpu8 ¶
M256MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm256_mask_avg_epu8'. Requires AVX512BW.
func M256MaskBlendEpi16 ¶
M256MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBLENDMW'. Intrinsic: '_mm256_mask_blend_epi16'. Requires AVX512BW.
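Blend takes no 'src' operand: the mask chooses per lane between the two inputs, as in this scalar model (illustrative only):

    package main

    import "fmt"

    func blend16(k uint16, a, b [16]int16) (dst [16]int16) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = b[j] // mask bit set: take b
            } else {
                dst[j] = a[j]
            }
        }
        return
    }

    func main() {
        var a, b [16]int16
        for i := range a {
            a[i], b[i] = 1, 2
        }
        fmt.Println(blend16(0x00FF, a, b)) // low eight lanes from b
    }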
func M256MaskBlendEpi8 ¶
M256MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBLENDMB'. Intrinsic: '_mm256_mask_blend_epi8'. Requires AVX512BW.
func M256MaskBroadcastbEpi8 ¶
M256MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_mask_broadcastb_epi8'. Requires AVX512BW.
func M256MaskBroadcastwEpi16 ¶
M256MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_mask_broadcastw_epi16'. Requires AVX512BW.
func M256MaskCmpEpi16Mask ¶
M256MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpi8Mask ¶
M256MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpu16Mask ¶
M256MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpEpu8Mask ¶
M256MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskCmpeqEpi16Mask ¶
M256MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpeq_epi16_mask'. Requires AVX512BW.
func M256MaskCmpeqEpi8Mask ¶
M256MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpeq_epi8_mask'. Requires AVX512BW.
func M256MaskCmpeqEpu16Mask ¶
M256MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpeq_epu16_mask'. Requires AVX512BW.
func M256MaskCmpeqEpu8Mask ¶
M256MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpeq_epu8_mask'. Requires AVX512BW.
func M256MaskCmpgeEpi16Mask ¶
M256MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpge_epi16_mask'. Requires AVX512BW.
func M256MaskCmpgeEpi8Mask ¶
M256MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpge_epi8_mask'. Requires AVX512BW.
func M256MaskCmpgeEpu16Mask ¶
M256MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpge_epu16_mask'. Requires AVX512BW.
func M256MaskCmpgeEpu8Mask ¶
M256MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpge_epu8_mask'. Requires AVX512BW.
func M256MaskCmpgtEpi16Mask ¶
M256MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpgt_epi16_mask'. Requires AVX512BW.
func M256MaskCmpgtEpi8Mask ¶
M256MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpgt_epi8_mask'. Requires AVX512BW.
func M256MaskCmpgtEpu16Mask ¶
M256MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpgt_epu16_mask'. Requires AVX512BW.
func M256MaskCmpgtEpu8Mask ¶
M256MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpgt_epu8_mask'. Requires AVX512BW.
func M256MaskCmpleEpi16Mask ¶
M256MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmple_epi16_mask'. Requires AVX512BW.
func M256MaskCmpleEpi8Mask ¶
M256MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmple_epi8_mask'. Requires AVX512BW.
func M256MaskCmpleEpu16Mask ¶
M256MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmple_epu16_mask'. Requires AVX512BW.
func M256MaskCmpleEpu8Mask ¶
M256MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmple_epu8_mask'. Requires AVX512BW.
func M256MaskCmpltEpi16Mask ¶
M256MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmplt_epi16_mask'. Requires AVX512BW.
func M256MaskCmpltEpi8Mask ¶
M256MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmplt_epi8_mask'. Requires AVX512BW.
func M256MaskCmpltEpu16Mask ¶
M256MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmplt_epu16_mask'. Requires AVX512BW.
func M256MaskCmpltEpu8Mask ¶
M256MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmplt_epu8_mask'. Requires AVX512BW.
func M256MaskCmpneqEpi16Mask ¶
M256MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpneq_epi16_mask'. Requires AVX512BW.
func M256MaskCmpneqEpi8Mask ¶
M256MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpneq_epi8_mask'. Requires AVX512BW.
func M256MaskCmpneqEpu16Mask ¶
M256MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpneq_epu16_mask'. Requires AVX512BW.
func M256MaskCmpneqEpu8Mask ¶
M256MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpneq_epu8_mask'. Requires AVX512BW.
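All of the masked compares above share one zeromask pattern: compute the full compare mask, then AND it with 'k1' so masked-off lanes report 0. A self-contained scalar sketch of the less-than epu8 case (helper names are illustrative):

    package main

    import "fmt"

    func cmpltEpu8Mask(a, b [32]uint8) (k uint32) {
        for j := 0; j < 32; j++ {
            if a[j] < b[j] {
                k |= 1 << uint(j)
            }
        }
        return
    }

    // The zeromask composition: the unmasked result ANDed with k1.
    func maskCmpltEpu8Mask(k1 uint32, a, b [32]uint8) uint32 {
        return cmpltEpu8Mask(a, b) & k1
    }

    func main() {
        var a, b [32]uint8
        for i := range b {
            b[i] = 1 // every lane satisfies a < b
        }
        fmt.Printf("%032b\n", maskCmpltEpu8Mask(0x0000FFFF, a, b))
    }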
func M256MaskCvtepi16Epi8 ¶
M256MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm256_mask_cvtepi16_epi8'. Requires AVX512BW.
func M256MaskCvtepi8Epi16 ¶
M256MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm256_mask_cvtepi8_epi16'. Requires AVX512BW.
func M256MaskCvtepu8Epi16 ¶
M256MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm256_mask_cvtepu8_epi16'. Requires AVX512BW.
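The sign extension and zero extension above diverge only for byte patterns with the top bit set, as a one-lane check shows:

    package main

    import "fmt"

    func main() {
        b := byte(0xFF)
        fmt.Println(int16(int8(b)))  // -1: sign extend
        fmt.Println(int16(uint8(b))) // 255: zero extend
    }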
func M256MaskCvtsepi16Epi8 ¶
M256MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_mask_cvtsepi16_epi8'. Requires AVX512BW.
func M256MaskCvtusepi16Epi8 ¶
M256MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_mask_cvtusepi16_epi8'. Requires AVX512BW.
func M256MaskDbsadEpu8 ¶
func M256MaskDbsadEpu8(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
M256MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 1
  i := j*128
  tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
  tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
  tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
  tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
  i := j*64
  tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
  tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
  tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
  tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_mask_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskMaddEpi16 ¶
M256MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm256_mask_madd_epi16'. Requires AVX512BW.
func M256MaskMaddubsEpi16 ¶
M256MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm256_mask_maddubs_epi16'. Requires AVX512BW.
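One 16-bit lane of the multiply-add above, as a scalar sketch: unsigned bytes from 'a', signed bytes from 'b', products widened before the add, and the sum saturated (helper names are illustrative):

    package main

    import "fmt"

    func saturate16(v int32) int16 {
        if v > 32767 {
            return 32767
        }
        if v < -32768 {
            return -32768
        }
        return int16(v)
    }

    func maddubs(aLo, aHi uint8, bLo, bHi int8) int16 {
        return saturate16(int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo))
    }

    func main() {
        fmt.Println(maddubs(255, 255, 127, 127)) // 32767: the sum saturates
    }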
func M256MaskMaxEpi16 ¶
M256MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm256_mask_max_epi16'. Requires AVX512BW.
func M256MaskMaxEpi8 ¶
M256MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm256_mask_max_epi8'. Requires AVX512BW.
func M256MaskMaxEpu16 ¶
M256MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm256_mask_max_epu16'. Requires AVX512BW.
func M256MaskMaxEpu8 ¶
M256MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm256_mask_max_epu8'. Requires AVX512BW.
func M256MaskMinEpi16 ¶
M256MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm256_mask_min_epi16'. Requires AVX512BW.
func M256MaskMinEpi8 ¶
M256MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm256_mask_min_epi8'. Requires AVX512BW.
func M256MaskMinEpu16 ¶
M256MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm256_mask_min_epu16'. Requires AVX512BW.
func M256MaskMinEpu8 ¶
M256MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm256_mask_min_epu8'. Requires AVX512BW.
func M256MaskMovEpi16 ¶
M256MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm256_mask_mov_epi16'. Requires AVX512BW.
func M256MaskMovEpi8 ¶
M256MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm256_mask_mov_epi8'. Requires AVX512BW.
func M256MaskMulhiEpi16 ¶
M256MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm256_mask_mulhi_epi16'. Requires AVX512BW.
func M256MaskMulhiEpu16 ¶
M256MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm256_mask_mulhi_epu16'. Requires AVX512BW.
func M256MaskMulhrsEpi16 ¶
M256MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm256_mask_mulhrs_epi16'. Requires AVX512BW.
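The rounding step in VPMULHRSW is easy to misread in the flattened pseudocode: the 32-bit product is shifted right by 14, incremented, and bits [16:1] are kept. A one-element Go sketch (function name illustrative):

    package main

    import "fmt"

    // mulhrs models one element: the product is shifted right by 14, rounded
    // by adding 1, and bits [16:1] of that intermediate are kept. Illustrative only.
    func mulhrs(a, b int16) int16 {
        tmp := (int32(a)*int32(b))>>14 + 1
        return int16(tmp >> 1)
    }

    func main() {
        // 0.5 * 0.5 in Q15 fixed point rounds to 0.25 (0x2000).
        fmt.Printf("%#x\n", mulhrs(0x4000, 0x4000))
    }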
func M256MaskMulloEpi16 ¶
M256MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm256_mask_mullo_epi16'. Requires AVX512BW.
func M256MaskPacksEpi16 ¶
M256MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := src[i+7:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm256_mask_packs_epi16'. Requires AVX512BW.
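Saturate_Int16_To_Int8 is the clamping helper that the signed pack operations rely on: out-of-range values pin to the nearest int8 bound rather than wrapping. A scalar Go sketch (function name illustrative):

    package main

    import "fmt"

    // satInt16ToInt8 models Saturate_Int16_To_Int8. Illustrative only.
    func satInt16ToInt8(v int16) int8 {
        switch {
        case v > 127:
            return 127
        case v < -128:
            return -128
        }
        return int8(v)
    }

    func main() {
        fmt.Println(satInt16ToInt8(300), satInt16ToInt8(-300), satInt16ToInt8(42)) // 127 -128 42
    }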
func M256MaskPacksEpi32 ¶
M256MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm256_mask_packs_epi32'. Requires AVX512BW.
func M256MaskPackusEpi16 ¶
M256MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := src[i+7:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm256_mask_packus_epi16'. Requires AVX512BW.
func M256MaskPackusEpi32 ¶
M256MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm256_mask_packus_epi32'. Requires AVX512BW.
func M256MaskPermutex2varEpi16 ¶
func M256MaskPermutex2varEpi16(a x86.M256i, k x86.Mmask16, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMT2W'. Intrinsic: '_mm256_mask_permutex2var_epi16'. Requires AVX512BW.
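Ignoring the writemask, the selection step above is a table lookup across both sources: bits [3:0] of each index pick an element and bit 4 picks between 'a' and 'b'. A Go sketch under that simplification (array types and names illustrative):

    package main

    import "fmt"

    // permutex2varEpi16 models the unmasked selection step. Illustrative only.
    func permutex2varEpi16(a [16]int16, idx [16]uint16, b [16]int16) (dst [16]int16) {
        for j := 0; j < 16; j++ {
            off := idx[j] & 0xF // element index within the chosen source
            if idx[j]&0x10 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return
    }

    func main() {
        var a, b [16]int16
        for j := range a {
            a[j], b[j] = int16(j), int16(100+j)
        }
        // Odd entries have bit 4 set, so they select from 'b'.
        idx := [16]uint16{0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}
        fmt.Println(permutex2varEpi16(a, idx, b))
    }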
func M256MaskPermutexvarEpi16 ¶
func M256MaskPermutexvarEpi16(src x86.M256i, k x86.Mmask16, idx x86.M256i, a x86.M256i) (dst x86.M256i)
M256MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm256_mask_permutexvar_epi16'. Requires AVX512BW.
func M256MaskSet1Epi16 ¶
M256MaskSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_mask_set1_epi16'. Requires AVX512BW.
func M256MaskSet1Epi8 ¶
M256MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_mask_set1_epi8'. Requires AVX512BW.
func M256MaskShuffleEpi8 ¶
M256MaskShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm256_mask_shuffle_epi8'. Requires AVX512BW.
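VPSHUFB selects bytes within 128-bit lanes: a control byte with its high bit set zeroes the result byte, otherwise its low four bits index a byte in the same lane. A single-lane Go sketch with the writemask omitted (names illustrative):

    package main

    import "fmt"

    // shuffleEpi8Lane models one 16-byte lane of the shuffle. Illustrative only.
    func shuffleEpi8Lane(a, b [16]byte) (dst [16]byte) {
        for j := 0; j < 16; j++ {
            if b[j]&0x80 != 0 {
                dst[j] = 0 // high bit of the control byte zeroes the element
            } else {
                dst[j] = a[b[j]&0x0F]
            }
        }
        return
    }

    func main() {
        var a [16]byte
        for j := range a {
            a[j] = byte('A' + j)
        }
        rev := [16]byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
        out := shuffleEpi8Lane(a, rev)
        fmt.Println(string(out[:])) // PONMLKJIHGFEDCBA
    }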
func M256MaskShufflehiEpi16 ¶
M256MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm256_mask_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
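One 128-bit lane of the shuffle above, with the writemask omitted: the low four words pass through and consecutive 2-bit fields of 'imm8' select each high word from the lane's high half. A Go sketch (names illustrative):

    package main

    import "fmt"

    // shufflehiLane models one lane as eight 16-bit words. Illustrative only.
    func shufflehiLane(a [8]uint16, imm8 uint8) (dst [8]uint16) {
        copy(dst[:4], a[:4]) // low 64 bits are copied unchanged
        for j := 0; j < 4; j++ {
            sel := (imm8 >> (2 * uint(j))) & 0x3
            dst[4+j] = a[4+sel]
        }
        return
    }

    func main() {
        a := [8]uint16{0, 1, 2, 3, 4, 5, 6, 7}
        // imm8 0x1B reverses the high half: [0 1 2 3 7 6 5 4].
        fmt.Println(shufflehiLane(a, 0x1B))
    }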
func M256MaskShuffleloEpi16 ¶
M256MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm256_mask_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskSllEpi16 ¶
M256MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm256_mask_sll_epi16'. Requires AVX512BW.
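Note the clamp: the whole 64-bit 'count' applies to every element, and any count above 15 zeroes the element instead of wrapping. A Go sketch of the masked shift above (names and array types illustrative):

    package main

    import "fmt"

    // sllEpi16 models the masked, uniform-count left shift. Illustrative only.
    func sllEpi16(src, a [16]uint16, k uint16, count uint64) (dst [16]uint16) {
        for j := 0; j < 16; j++ {
            switch {
            case k&(1<<uint(j)) == 0:
                dst[j] = src[j] // mask bit clear: keep the src element
            case count > 15:
                dst[j] = 0 // oversized counts zero the element
            default:
                dst[j] = a[j] << count
            }
        }
        return
    }

    func main() {
        var src, a [16]uint16
        for j := range a {
            a[j] = 1
        }
        fmt.Println(sllEpi16(src, a, 0xFFFF, 3)) // every element becomes 8
    }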
func M256MaskSlliEpi16 ¶
M256MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm256_mask_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskSllvEpi16 ¶
M256MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm256_mask_sllv_epi16'. Requires AVX512BW.
func M256MaskSraEpi16 ¶
M256MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm256_mask_sra_epi16'. Requires AVX512BW.
func M256MaskSraiEpi16 ¶
M256MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm256_mask_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskSravEpi16 ¶
M256MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm256_mask_srav_epi16'. Requires AVX512BW.
func M256MaskSrlEpi16 ¶
M256MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm256_mask_srl_epi16'. Requires AVX512BW.
func M256MaskSrliEpi16 ¶
M256MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm256_mask_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskSrlvEpi16 ¶
M256MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm256_mask_srlv_epi16'. Requires AVX512BW.
func M256MaskSubEpi16 ¶
M256MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm256_mask_sub_epi16'. Requires AVX512BW.
func M256MaskSubEpi8 ¶
M256MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm256_mask_sub_epi8'. Requires AVX512BW.
func M256MaskSubsEpi16 ¶
M256MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm256_mask_subs_epi16'. Requires AVX512BW.
func M256MaskSubsEpi8 ¶
M256MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm256_mask_subs_epi8'. Requires AVX512BW.
func M256MaskSubsEpu16 ¶
M256MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm256_mask_subs_epu16'. Requires AVX512BW.
func M256MaskSubsEpu8 ¶
M256MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm256_mask_subs_epu8'. Requires AVX512BW.
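Unsigned saturating subtraction clamps at zero instead of wrapping around. A one-element Go sketch (function name illustrative):

    package main

    import "fmt"

    // subsEpu8 models Saturate_To_UnsignedInt8(a - b) for bytes. Illustrative only.
    func subsEpu8(a, b uint8) uint8 {
        if a < b {
            return 0
        }
        return a - b
    }

    func main() {
        fmt.Println(subsEpu8(10, 3), subsEpu8(3, 10)) // 7 0
    }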
func M256MaskTestEpi16Mask ¶
M256MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm256_mask_test_epi16_mask'. Requires AVX512BW.
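A Go sketch of the mask-producing test above: bit j of the result is set only when the writemask allows it and the per-element AND is non-zero (names illustrative):

    package main

    import "fmt"

    // testEpi16Mask models the pseudocode above. Illustrative only.
    func testEpi16Mask(k1 uint16, a, b [16]uint16) (k uint16) {
        for j := 0; j < 16; j++ {
            if k1&(1<<uint(j)) != 0 && a[j]&b[j] != 0 {
                k |= 1 << uint(j)
            }
        }
        return
    }

    func main() {
        var a, b [16]uint16
        a[0], b[0] = 0xFF00, 0x00FF // AND is zero, so bit 0 stays clear
        a[1], b[1] = 0xFF00, 0xF000 // AND is non-zero, so bit 1 is set
        fmt.Println(testEpi16Mask(0xFFFF, a, b)) // 2
    }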
func M256MaskTestEpi8Mask ¶
M256MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm256_mask_test_epi8_mask'. Requires AVX512BW.
func M256MaskTestnEpi16Mask ¶
M256MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 15 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm256_mask_testn_epi16_mask'. Requires AVX512BW.
func M256MaskTestnEpi8Mask ¶
M256MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 31 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm256_mask_testn_epi8_mask'. Requires AVX512BW.
func M256MaskUnpackhiEpi16 ¶
M256MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
  dst[15:0] := src1[79:64]
  dst[31:16] := src2[79:64]
  dst[47:32] := src1[95:80]
  dst[63:48] := src2[95:80]
  dst[79:64] := src1[111:96]
  dst[95:80] := src2[111:96]
  dst[111:96] := src1[127:112]
  dst[127:112] := src2[127:112]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm256_mask_unpackhi_epi16'. Requires AVX512BW.
func M256MaskUnpackhiEpi8 ¶
M256MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
  dst[7:0] := src1[71:64]
  dst[15:8] := src2[71:64]
  dst[23:16] := src1[79:72]
  dst[31:24] := src2[79:72]
  dst[39:32] := src1[87:80]
  dst[47:40] := src2[87:80]
  dst[55:48] := src1[95:88]
  dst[63:56] := src2[95:88]
  dst[71:64] := src1[103:96]
  dst[79:72] := src2[103:96]
  dst[87:80] := src1[111:104]
  dst[95:88] := src2[111:104]
  dst[103:96] := src1[119:112]
  dst[111:104] := src2[119:112]
  dst[119:112] := src1[127:120]
  dst[127:120] := src2[127:120]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := src[i+7:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm256_mask_unpackhi_epi8'. Requires AVX512BW.
func M256MaskUnpackloEpi16 ¶
M256MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
  dst[15:0] := src1[15:0]
  dst[31:16] := src2[15:0]
  dst[47:32] := src1[31:16]
  dst[63:48] := src2[31:16]
  dst[79:64] := src1[47:32]
  dst[95:80] := src2[47:32]
  dst[111:96] := src1[63:48]
  dst[127:112] := src2[63:48]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := src[i+15:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm256_mask_unpacklo_epi16'. Requires AVX512BW.
func M256MaskUnpackloEpi8 ¶
M256MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
  dst[7:0] := src1[7:0]
  dst[15:8] := src2[7:0]
  dst[23:16] := src1[15:8]
  dst[31:24] := src2[15:8]
  dst[39:32] := src1[23:16]
  dst[47:40] := src2[23:16]
  dst[55:48] := src1[31:24]
  dst[63:56] := src2[31:24]
  dst[71:64] := src1[39:32]
  dst[79:72] := src2[39:32]
  dst[87:80] := src1[47:40]
  dst[95:88] := src2[47:40]
  dst[103:96] := src1[55:48]
  dst[111:104] := src2[55:48]
  dst[119:112] := src1[63:56]
  dst[127:120] := src2[63:56]
  RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := src[i+7:i]
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm256_mask_unpacklo_epi8'. Requires AVX512BW.
func M256MaskzAbsEpi16 ¶
M256MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm256_maskz_abs_epi16'. Requires AVX512BW.
func M256MaskzAbsEpi8 ¶
M256MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm256_maskz_abs_epi8'. Requires AVX512BW.
func M256MaskzAddEpi16 ¶
M256MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm256_maskz_add_epi16'. Requires AVX512BW.
func M256MaskzAddEpi8 ¶
M256MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm256_maskz_add_epi8'. Requires AVX512BW.
func M256MaskzAddsEpi16 ¶
M256MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm256_maskz_adds_epi16'. Requires AVX512BW.
func M256MaskzAddsEpi8 ¶
M256MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm256_maskz_adds_epi8'. Requires AVX512BW.
func M256MaskzAddsEpu16 ¶
M256MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm256_maskz_adds_epu16'. Requires AVX512BW.
func M256MaskzAddsEpu8 ¶
M256MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm256_maskz_adds_epu8'. Requires AVX512BW.
func M256MaskzAlignrEpi8 ¶
M256MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 1 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm256_maskz_alignr_epi8'. Requires AVX512BW.
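The concatenate-and-shift runs independently in each 128-bit lane. A single-lane Go sketch with the zeromask omitted (names illustrative):

    package main

    import "fmt"

    // alignrLane models one lane: (a:b) treated as a 32-byte value shifted
    // right by count bytes, keeping the low 16 bytes. Illustrative only.
    func alignrLane(a, b [16]byte, count int) (dst [16]byte) {
        var tmp [32]byte
        copy(tmp[:16], b[:]) // b supplies the low 16 bytes
        copy(tmp[16:], a[:]) // a supplies the high 16 bytes
        for j := 0; j < 16; j++ {
            if count+j < 32 {
                dst[j] = tmp[count+j]
            }
            // bytes shifted in past the concatenation stay zero
        }
        return
    }

    func main() {
        var a, b [16]byte
        for j := range a {
            a[j], b[j] = byte(16+j), byte(j)
        }
        fmt.Println(alignrLane(a, b, 4)) // [4 5 6 ... 19]
    }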
func M256MaskzAvgEpu16 ¶
M256MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm256_maskz_avg_epu16'. Requires AVX512BW.
func M256MaskzAvgEpu8 ¶
M256MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm256_maskz_avg_epu8'. Requires AVX512BW.
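The +1 makes the average round up on ties. A one-element Go sketch, widening to avoid overflow (function name illustrative):

    package main

    import "fmt"

    // avgEpu8 models (a + b + 1) >> 1 for unsigned bytes. Illustrative only.
    func avgEpu8(a, b uint8) uint8 {
        return uint8((uint16(a) + uint16(b) + 1) >> 1)
    }

    func main() {
        fmt.Println(avgEpu8(10, 13), avgEpu8(255, 255)) // 12 255
    }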
func M256MaskzBroadcastbEpi8 ¶
M256MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_maskz_broadcastb_epi8'. Requires AVX512BW.
func M256MaskzBroadcastwEpi16 ¶
M256MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_maskz_broadcastw_epi16'. Requires AVX512BW.
func M256MaskzCvtepi16Epi8 ¶
M256MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm256_maskz_cvtepi16_epi8'. Requires AVX512BW.
func M256MaskzCvtepi8Epi16 ¶
M256MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm256_maskz_cvtepi8_epi16'. Requires AVX512BW.
func M256MaskzCvtepu8Epi16 ¶
M256MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm256_maskz_cvtepu8_epi16'. Requires AVX512BW.
func M256MaskzCvtsepi16Epi8 ¶
M256MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_maskz_cvtsepi16_epi8'. Requires AVX512BW.
func M256MaskzCvtusepi16Epi8 ¶
M256MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_maskz_cvtusepi16_epi8'. Requires AVX512BW.
func M256MaskzDbsadEpu8 ¶
M256MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The
first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 1
  i := j*128
  tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
  tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
  tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
  tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
  i := j*64
  tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
  tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
  tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
  tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_maskz_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
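A single-lane Go sketch of the two phases above, with the zeromask omitted: 'imm8' first selects four 4-byte quadruplets from 'b', then each 64-bit half of the lane produces four 16-bit SADs at one-byte offsets into the selected bytes (names illustrative):

    package main

    import "fmt"

    func absDiff(x, y uint8) uint16 {
        if x > y {
            return uint16(x - y)
        }
        return uint16(y - x)
    }

    // dbsadLane models one 128-bit lane of VDBPSADBW. Illustrative only.
    func dbsadLane(a, b [16]byte, imm8 uint8) (dst [8]uint16) {
        var tmp [16]byte
        for q := 0; q < 4; q++ { // quadruplet selection from b
            sel := int((imm8 >> (2 * uint(q))) & 0x3)
            copy(tmp[q*4:q*4+4], b[sel*4:sel*4+4])
        }
        for h := 0; h < 2; h++ { // each 64-bit half of the lane
            i := h * 8
            for s := 0; s < 4; s++ {
                aOff := i + (s/2)*4 // SADs 0,1 use a's low quadruplet; 2,3 the high one
                tOff := i + s       // each SAD staggers one byte further into tmp
                var sum uint16
                for t := 0; t < 4; t++ {
                    sum += absDiff(a[aOff+t], tmp[tOff+t])
                }
                dst[h*4+s] = sum
            }
        }
        return
    }

    func main() {
        var a, b [16]byte
        for j := range a {
            a[j], b[j] = byte(j), byte(j)
        }
        // imm8 0xE4 selects b's quadruplets in place, so a and tmp match.
        fmt.Println(dbsadLane(a, b, 0xE4)) // [0 4 8 4 0 4 8 4]
    }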
func M256MaskzMaddEpi16 ¶
M256MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm256_maskz_madd_epi16'. Requires AVX512BW.
func M256MaskzMaddubsEpi16 ¶
M256MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm256_maskz_maddubs_epi16'. Requires AVX512BW.
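Each result element above is two unsigned-by-signed byte products summed and then saturated to 16 bits; saturation can trigger because the unsaturated sum can reach 2*255*127. A one-element Go sketch (names illustrative):

    package main

    import "fmt"

    // satToInt16 clamps to the int16 range, modeling Saturate_To_Int16.
    func satToInt16(v int32) int16 {
        switch {
        case v > 32767:
            return 32767
        case v < -32768:
            return -32768
        }
        return int16(v)
    }

    // maddubs models one result element: a0,b0 are the even (low) byte pair,
    // a1,b1 the odd (high) byte pair. Illustrative only.
    func maddubs(a0, a1 uint8, b0, b1 int8) int16 {
        return satToInt16(int32(a1)*int32(b1) + int32(a0)*int32(b0))
    }

    func main() {
        fmt.Println(maddubs(255, 255, 127, 127)) // saturates to 32767
        fmt.Println(maddubs(2, 3, -4, 5))        // 2*(-4) + 3*5 = 7
    }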
func M256MaskzMaxEpi16 ¶
M256MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm256_maskz_max_epi16'. Requires AVX512BW.
func M256MaskzMaxEpi8 ¶
M256MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm256_maskz_max_epi8'. Requires AVX512BW.
func M256MaskzMaxEpu16 ¶
M256MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm256_maskz_max_epu16'. Requires AVX512BW.
func M256MaskzMaxEpu8 ¶
M256MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm256_maskz_max_epu8'. Requires AVX512BW.
func M256MaskzMinEpi16 ¶
M256MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm256_maskz_min_epi16'. Requires AVX512BW.
func M256MaskzMinEpi8 ¶
M256MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm256_maskz_min_epi8'. Requires AVX512BW.
func M256MaskzMinEpu16 ¶
M256MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm256_maskz_min_epu16'. Requires AVX512BW.
func M256MaskzMinEpu8 ¶
M256MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm256_maskz_min_epu8'. Requires AVX512BW.
func M256MaskzMovEpi16 ¶
M256MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm256_maskz_mov_epi16'. Requires AVX512BW.
func M256MaskzMovEpi8 ¶
M256MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm256_maskz_mov_epi8'. Requires AVX512BW.
func M256MaskzMulhiEpi16 ¶
M256MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm256_maskz_mulhi_epi16'. Requires AVX512BW.
func M256MaskzMulhiEpu16 ¶
M256MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm256_maskz_mulhi_epu16'. Requires AVX512BW.
func M256MaskzMulhrsEpi16 ¶
M256MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm256_maskz_mulhrs_epi16'. Requires AVX512BW.
func M256MaskzMulloEpi16 ¶
M256MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm256_maskz_mullo_epi16'. Requires AVX512BW.
func M256MaskzPacksEpi16 ¶
M256MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm256_maskz_packs_epi16'. Requires AVX512BW.
func M256MaskzPacksEpi32 ¶
M256MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm256_maskz_packs_epi32'. Requires AVX512BW.
func M256MaskzPackusEpi16 ¶
M256MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
FOR j := 0 to 31
  i := j*8
  IF k[j]
    dst[i+7:i] := tmp_dst[i+7:i]
  ELSE
    dst[i+7:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm256_maskz_packus_epi16'. Requires AVX512BW.
func M256MaskzPackusEpi32 ¶
M256MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm256_maskz_packus_epi32'. Requires AVX512BW.
func M256MaskzPermutex2varEpi16 ¶
func M256MaskzPermutex2varEpi16(k x86.Mmask16, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)
M256MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm256_maskz_permutex2var_epi16'. Requires AVX512BW.
func M256MaskzPermutexvarEpi16 ¶
M256MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm256_maskz_permutexvar_epi16'. Requires AVX512BW.
func M256MaskzSet1Epi16 ¶
M256MaskzSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_maskz_set1_epi16'. Requires AVX512BW.
func M256MaskzSet1Epi8 ¶
M256MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_maskz_set1_epi8'. Requires AVX512BW.
func M256MaskzShuffleEpi8 ¶
M256MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm256_maskz_shuffle_epi8'. Requires AVX512BW.
func M256MaskzShufflehiEpi16 ¶
M256MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm256_maskz_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskzShuffleloEpi16 ¶
M256MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
FOR j := 0 to 15
  i := j*16
  IF k[j]
    dst[i+15:i] := tmp_dst[i+15:i]
  ELSE
    dst[i+15:i] := 0
  FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm256_maskz_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskzSllEpi16 ¶
M256MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm256_maskz_sll_epi16'. Requires AVX512BW.
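Note that 'count' is a single 64-bit value applied to every element, and any count above 15 clears the element rather than being taken modulo 16. A one-element Go sketch (sllWord is illustrative only):

    // sllWord models one element of VPSLLW: oversized shift counts
    // produce zero instead of wrapping.
    func sllWord(a uint16, count uint64) uint16 {
        if count > 15 {
            return 0
        }
        return a << count
    }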
func M256MaskzSlliEpi16 ¶
M256MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm256_maskz_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskzSllvEpi16 ¶
M256MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm256_maskz_sllv_epi16'. Requires AVX512BW.
func M256MaskzSraEpi16 ¶
M256MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm256_maskz_sra_epi16'. Requires AVX512BW.
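Unlike the logical shifts, an oversized arithmetic count fills the element with copies of the sign bit. A one-element Go sketch (sraWord is illustrative only):

    // sraWord models one element of VPSRAW: counts above 15 are
    // clamped to 15, which replicates the sign bit everywhere.
    func sraWord(a int16, count uint64) int16 {
        if count > 15 {
            count = 15
        }
        return a >> count // Go's >> on signed values is arithmetic
    }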
func M256MaskzSraiEpi16 ¶
M256MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm256_maskz_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskzSravEpi16 ¶
M256MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm256_maskz_srav_epi16'. Requires AVX512BW.
func M256MaskzSrlEpi16 ¶
M256MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm256_maskz_srl_epi16'. Requires AVX512BW.
func M256MaskzSrliEpi16 ¶
M256MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm256_maskz_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M256MaskzSrlvEpi16 ¶
M256MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm256_maskz_srlv_epi16'. Requires AVX512BW.
func M256MaskzSubEpi16 ¶
M256MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm256_maskz_sub_epi16'. Requires AVX512BW.
func M256MaskzSubEpi8 ¶
M256MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm256_maskz_sub_epi8'. Requires AVX512BW.
func M256MaskzSubsEpi16 ¶
M256MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm256_maskz_subs_epi16'. Requires AVX512BW.
func M256MaskzSubsEpi8 ¶
M256MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm256_maskz_subs_epi8'. Requires AVX512BW.
func M256MaskzSubsEpu16 ¶
M256MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm256_maskz_subs_epu16'. Requires AVX512BW.
func M256MaskzSubsEpu8 ¶
M256MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm256_maskz_subs_epu8'. Requires AVX512BW.
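Unsigned saturating subtraction simply clamps at zero. A one-element Go sketch (subsU8 is illustrative only):

    // subsU8 models Saturate_To_UnsignedInt8(a - b): results below
    // zero clamp to zero instead of wrapping around.
    func subsU8(a, b uint8) uint8 {
        if b > a {
            return 0
        }
        return a - b
    }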
func M256MaskzUnpackhiEpi16 ¶
M256MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
    dst[15:0] := src1[79:64]
    dst[31:16] := src2[79:64]
    dst[47:32] := src1[95:80]
    dst[63:48] := src2[95:80]
    dst[79:64] := src1[111:96]
    dst[95:80] := src2[111:96]
    dst[111:96] := src1[127:112]
    dst[127:112] := src2[127:112]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := tmp_dst[i+15:i]
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm256_maskz_unpackhi_epi16'. Requires AVX512BW.
func M256MaskzUnpackhiEpi8 ¶
M256MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
    dst[7:0] := src1[71:64]
    dst[15:8] := src2[71:64]
    dst[23:16] := src1[79:72]
    dst[31:24] := src2[79:72]
    dst[39:32] := src1[87:80]
    dst[47:40] := src2[87:80]
    dst[55:48] := src1[95:88]
    dst[63:56] := src2[95:88]
    dst[71:64] := src1[103:96]
    dst[79:72] := src2[103:96]
    dst[87:80] := src1[111:104]
    dst[95:88] := src2[111:104]
    dst[103:96] := src1[119:112]
    dst[111:104] := src2[119:112]
    dst[119:112] := src1[127:120]
    dst[127:120] := src2[127:120]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := tmp_dst[i+7:i]
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm256_maskz_unpackhi_epi8'. Requires AVX512BW.
func M256MaskzUnpackloEpi16 ¶
M256MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
    dst[15:0] := src1[15:0]
    dst[31:16] := src2[15:0]
    dst[47:32] := src1[31:16]
    dst[63:48] := src2[31:16]
    dst[79:64] := src1[47:32]
    dst[95:80] := src2[47:32]
    dst[111:96] := src1[63:48]
    dst[127:112] := src2[63:48]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
    i := j*16
    IF k[j]
        dst[i+15:i] := tmp_dst[i+15:i]
    ELSE
        dst[i+15:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm256_maskz_unpacklo_epi16'. Requires AVX512BW.
func M256MaskzUnpackloEpi8 ¶
M256MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
    dst[7:0] := src1[7:0]
    dst[15:8] := src2[7:0]
    dst[23:16] := src1[15:8]
    dst[31:24] := src2[15:8]
    dst[39:32] := src1[23:16]
    dst[47:40] := src2[23:16]
    dst[55:48] := src1[31:24]
    dst[63:56] := src2[31:24]
    dst[71:64] := src1[39:32]
    dst[79:72] := src2[39:32]
    dst[87:80] := src1[47:40]
    dst[95:88] := src2[47:40]
    dst[103:96] := src1[55:48]
    dst[111:104] := src2[55:48]
    dst[119:112] := src1[63:56]
    dst[127:120] := src2[63:56]
    RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
    i := j*8
    IF k[j]
        dst[i+7:i] := tmp_dst[i+7:i]
    ELSE
        dst[i+7:i] := 0
    FI
ENDFOR
dst[MAX:256] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm256_maskz_unpacklo_epi8'. Requires AVX512BW.
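The INTERLEAVE_BYTES helper above reduces to a simple alternation of the two sources. A Go sketch of one 16-byte lane (interleaveLowBytes is illustrative only):

    // interleaveLowBytes models INTERLEAVE_BYTES: the low 8 bytes of
    // src1 and src2 are interleaved, src1 first.
    func interleaveLowBytes(src1, src2 [16]byte) (dst [16]byte) {
        for j := 0; j < 8; j++ {
            dst[2*j] = src1[j]
            dst[2*j+1] = src2[j]
        }
        return
    }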
func M256Movepi16Mask ¶
M256Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.
FOR j := 0 to 15 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPMOVW2M'. Intrinsic: '_mm256_movepi16_mask'. Requires AVX512BW.
func M256Movepi8Mask ¶
M256Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.
FOR j := 0 to 31 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPMOVB2M'. Intrinsic: '_mm256_movepi8_mask'. Requires AVX512BW.
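A scalar Go sketch of the sign-bit extraction (movbMask is illustrative only):

    // movbMask models VPMOVB2M: bit j of the mask is the most
    // significant bit of byte j, i.e. whether byte j is negative
    // when read as a signed byte.
    func movbMask(a [32]int8) (k uint32) {
        for j, v := range a {
            if v < 0 {
                k |= 1 << uint(j)
            }
        }
        return
    }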
func M256MovmEpi16 ¶
M256MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.
FOR j := 0 to 15 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVM2W'. Intrinsic: '_mm256_movm_epi16'. Requires AVX512BW.
func M256MovmEpi8 ¶
M256MovmEpi8: Set each packed 8-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.
FOR j := 0 to 31 i := j*8 IF k[j] dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVM2B'. Intrinsic: '_mm256_movm_epi8'. Requires AVX512BW.
func M256Permutex2varEpi16 ¶
M256Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 off := 16*idx[i+3:i] dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm256_permutex2var_epi16'. Requires AVX512BW.
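Each 16-bit index therefore carries five significant bits: bit 4 picks the source register and bits 3:0 pick the word. A Go sketch (permutex2varWords is illustrative only):

    // permutex2varWords models VPERMI2W for 16-element vectors.
    func permutex2varWords(a, b, idx [16]uint16) (dst [16]uint16) {
        for j := 0; j < 16; j++ {
            off := idx[j] & 0xF // word index within the source
            if idx[j]&0x10 != 0 {
                dst[j] = b[off] // selector bit set: take from b
            } else {
                dst[j] = a[off]
            }
        }
        return
    }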
func M256PermutexvarEpi16 ¶
M256PermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 id := idx[i+3:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:256] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm256_permutexvar_epi16'. Requires AVX512BW.
func M256SllvEpi16 ¶
M256SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm256_sllv_epi16'. Requires AVX512BW.
func M256SravEpi16 ¶
M256SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm256_srav_epi16'. Requires AVX512BW.
func M256SrlvEpi16 ¶
M256SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 15 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm256_srlv_epi16'. Requires AVX512BW.
func M256TestEpi16Mask ¶
M256TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 15 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm256_test_epi16_mask'. Requires AVX512BW.
func M256TestEpi8Mask ¶
M256TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 31 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm256_test_epi8_mask'. Requires AVX512BW.
func M256TestnEpi16Mask ¶
M256TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 15 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm256_testn_epi16_mask'. Requires AVX512BW.
func M256TestnEpi8Mask ¶
M256TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 31 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm256_testn_epi8_mask'. Requires AVX512BW.
func M512AbsEpi16 ¶
M512AbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := ABS(a[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm512_abs_epi16'. Requires AVX512BW.
func M512AbsEpi8 ¶
M512AbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := ABS(a[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm512_abs_epi8'. Requires AVX512BW.
func M512AddEpi16 ¶
M512AddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[i+15:i] + b[i+15:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm512_add_epi16'. Requires AVX512BW.
func M512AddEpi8 ¶
M512AddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[i+7:i] + b[i+7:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm512_add_epi8'. Requires AVX512BW.
func M512AddsEpi16 ¶
M512AddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm512_adds_epi16'. Requires AVX512BW.
func M512AddsEpi8 ¶
M512AddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm512_adds_epi8'. Requires AVX512BW.
func M512AddsEpu16 ¶
M512AddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm512_adds_epu16'. Requires AVX512BW.
func M512AddsEpu8 ¶
M512AddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm512_adds_epu8'. Requires AVX512BW.
func M512AlignrEpi8 ¶
M512AlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst'.
FOR j := 0 to 3 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) dst[i+127:i] := tmp[127:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm512_alignr_epi8'. Requires AVX512BW.
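The concatenate-and-shift happens independently in each 128-bit lane. A Go sketch of one lane (alignrLane is illustrative only):

    // alignrLane models one 128-bit lane of VPALIGNR: a is placed
    // above b, the 32-byte concatenation is shifted right by count
    // bytes, and the low 16 bytes are kept. Bytes shifted in from
    // beyond the concatenation are zero.
    func alignrLane(a, b [16]byte, count uint) (dst [16]byte) {
        var tmp [32]byte
        copy(tmp[:16], b[:])
        copy(tmp[16:], a[:])
        for j := 0; j < 16; j++ {
            if int(count)+j < 32 {
                dst[j] = tmp[count+uint(j)]
            }
        }
        return
    }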
func M512AvgEpu16 ¶
M512AvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm512_avg_epu16'. Requires AVX512BW.
func M512AvgEpu8 ¶
M512AvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm512_avg_epu8'. Requires AVX512BW.
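The '+ 1' before the shift makes the average round up on ties. A one-element Go sketch (avgU8 is illustrative only):

    // avgU8 models VPAVGB: (a + b + 1) >> 1, computed in a wider
    // type so the intermediate sum cannot overflow.
    func avgU8(a, b uint8) uint8 {
        return uint8((uint16(a) + uint16(b) + 1) >> 1)
    }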
func M512BroadcastbEpi8 ¶
M512BroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[7:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_broadcastb_epi8'. Requires AVX512BW.
func M512BroadcastwEpi16 ¶
M512BroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[15:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_broadcastw_epi16'. Requires AVX512BW.
func M512BslliEpi128 ¶
M512BslliEpi128: Shift 128-bit lanes in 'a' left by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.
tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] << (tmp*8) dst[255:128] := a[255:128] << (tmp*8) dst[383:256] := a[383:256] << (tmp*8) dst[511:384] := a[511:384] << (tmp*8) dst[MAX:512] := 0
Instruction: 'VPSLLDQ'. Intrinsic: '_mm512_bslli_epi128'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512BsrliEpi128 ¶
M512BsrliEpi128: Shift 128-bit lanes in 'a' right by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.
tmp := imm8[7:0] IF tmp > 15 tmp := 16 FI dst[127:0] := a[127:0] >> (tmp*8) dst[255:128] := a[255:128] >> (tmp*8) dst[383:256] := a[383:256] >> (tmp*8) dst[511:384] := a[511:384] >> (tmp*8) dst[MAX:512] := 0
Instruction: 'VPSRLDQ'. Intrinsic: '_mm512_bsrli_epi128'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512CmpEpi16Mask ¶
M512CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
    i := j*16
    k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
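The imm8 predicate encoding is shared by all of the VPCMP*/VPCMPU* variants in this package. A one-element Go sketch for the signed 16-bit case (cmpWord is illustrative only):

    // cmpWord models the imm8 predicate table of VPCMPW; predicates
    // 3 (FALSE) and 7 (TRUE) ignore the operands entirely.
    func cmpWord(a, b int16, imm8 byte) bool {
        switch imm8 & 7 {
        case 0:
            return a == b
        case 1:
            return a < b
        case 2:
            return a <= b
        case 3:
            return false
        case 4:
            return a != b
        case 5:
            return a >= b // NLT: not less-than
        case 6:
            return a > b // NLE: not less-than-or-equal
        default:
            return true
        }
    }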
func M512CmpEpi8Mask ¶
M512CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
    i := j*8
    k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512CmpEpu16Mask ¶
M512CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
    i := j*16
    k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512CmpEpu8Mask ¶
M512CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
    i := j*8
    k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512CmpeqEpi16Mask ¶
M512CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpeq_epi16_mask'. Requires AVX512BW.
func M512CmpeqEpi8Mask ¶
M512CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpeq_epi8_mask'. Requires AVX512BW.
func M512CmpeqEpu16Mask ¶
M512CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpeq_epu16_mask'. Requires AVX512BW.
func M512CmpeqEpu8Mask ¶
M512CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpeq_epu8_mask'. Requires AVX512BW.
func M512CmpgeEpi16Mask ¶
M512CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpge_epi16_mask'. Requires AVX512BW.
func M512CmpgeEpi8Mask ¶
M512CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpge_epi8_mask'. Requires AVX512BW.
func M512CmpgeEpu16Mask ¶
M512CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpge_epu16_mask'. Requires AVX512BW.
func M512CmpgeEpu8Mask ¶
M512CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpge_epu8_mask'. Requires AVX512BW.
func M512CmpgtEpi16Mask ¶
M512CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpgt_epi16_mask'. Requires AVX512BW.
func M512CmpgtEpi8Mask ¶
M512CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpgt_epi8_mask'. Requires AVX512BW.
func M512CmpgtEpu16Mask ¶
M512CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpgt_epu16_mask'. Requires AVX512BW.
func M512CmpgtEpu8Mask ¶
M512CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpgt_epu8_mask'. Requires AVX512BW.
func M512CmpleEpi16Mask ¶
M512CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmple_epi16_mask'. Requires AVX512BW.
func M512CmpleEpi8Mask ¶
M512CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmple_epi8_mask'. Requires AVX512BW.
func M512CmpleEpu16Mask ¶
M512CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmple_epu16_mask'. Requires AVX512BW.
func M512CmpleEpu8Mask ¶
M512CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmple_epu8_mask'. Requires AVX512BW.
func M512CmpltEpi16Mask ¶
M512CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmplt_epi16_mask'. Requires AVX512BW.
func M512CmpltEpi8Mask ¶
M512CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmplt_epi8_mask'. Requires AVX512BW.
func M512CmpltEpu16Mask ¶
M512CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmplt_epu16_mask'. Requires AVX512BW.
func M512CmpltEpu8Mask ¶
M512CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmplt_epu8_mask'. Requires AVX512BW.
func M512CmpneqEpi16Mask ¶
M512CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpneq_epi16_mask'. Requires AVX512BW.
func M512CmpneqEpi8Mask ¶
M512CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpneq_epi8_mask'. Requires AVX512BW.
func M512CmpneqEpu16Mask ¶
M512CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 31 i := j*16 k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpneq_epu16_mask'. Requires AVX512BW.
func M512CmpneqEpu8Mask ¶
M512CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.
FOR j := 0 to 63 i := j*8 k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpneq_epu8_mask'. Requires AVX512BW.
func M512Cvtepi16Epi8 ¶
M512Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.
FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm512_cvtepi16_epi8'. Requires AVX512BW.
func M512Cvtepi8Epi16 ¶
M512Cvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.
FOR j := 0 to 31 i := j*8 l := j*16 dst[l+15:l] := SignExtend(a[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_cvtepi8_epi16'. Requires AVX512BW.
func M512Cvtepu8Epi16 ¶
M512Cvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.
FOR j := 0 to 31 i := j*8 l := j*16 dst[l+15:l] := ZeroExtend(a[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_cvtepu8_epi16'. Requires AVX512BW.
func M512Cvtsepi16Epi8 ¶
M512Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_cvtsepi16_epi8'. Requires AVX512BW.
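Signed saturation clamps out-of-range words to the int8 limits instead of truncating. A one-element Go sketch (saturateInt16ToInt8 is illustrative only):

    // saturateInt16ToInt8 models Saturate_Int16_To_Int8.
    func saturateInt16ToInt8(v int16) int8 {
        if v > 127 {
            return 127
        }
        if v < -128 {
            return -128
        }
        return int8(v)
    }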
func M512Cvtusepi16Epi8 ¶
M512Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := 16*j l := 8*j dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_cvtusepi16_epi8'. Requires AVX512BW.
func M512DbsadEpu8 ¶
M512DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 3
    i := j*128
    tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
    tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
    tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
    tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 7
    i := j*64
    dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
    dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
    dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
    dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:512] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
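Each of the four 16-bit results in a 64-bit lane is one such SAD over a byte quadruplet. A Go sketch of that building block (sad4 is illustrative only; it covers a single SAD, not the quadruplet selection):

    // sad4 models a single four-byte sum of absolute differences as
    // used by VDBPSADBW.
    func sad4(a, t [4]uint8) uint16 {
        var s uint16
        for j := 0; j < 4; j++ {
            d := int(a[j]) - int(t[j])
            if d < 0 {
                d = -d
            }
            s += uint16(d)
        }
        return s
    }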
func M512Kunpackd ¶
M512Kunpackd: Unpack and interleave 32 bits from masks 'a' and 'b', and store the 64-bit result in 'k'.
k[31:0] := a[31:0] k[63:32] := b[31:0] k[MAX:64] := 0
Instruction: 'KUNPCKDQ'. Intrinsic: '_mm512_kunpackd'. Requires AVX512BW.
func M512Kunpackw ¶
M512Kunpackw: Unpack and interleave 16 bits from masks 'a' and 'b', and store the 32-bit result in 'k'.
k[15:0] := a[15:0] k[31:16] := b[15:0] k[MAX:32] := 0
Instruction: 'KUNPCKWD'. Intrinsic: '_mm512_kunpackw'. Requires AVX512BW.
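Despite the name, this is a straight concatenation of the two low halves. A Go sketch (kunpackw is illustrative only):

    // kunpackw models KUNPCKWD: the low 16 bits of b become the high
    // half of the result, above the low 16 bits of a.
    func kunpackw(a, b uint32) uint32 {
        return (b&0xFFFF)<<16 | a&0xFFFF
    }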
func M512MaddEpi16 ¶
M512MaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst'.
FOR j := 0 to 15 i := j*32 dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm512_madd_epi16'. Requires AVX512BW.
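Each 32-bit result is the sum of the two vertical 16-bit products of a pair. A Go sketch of one element (maddPair is illustrative only; like the hardware, Go's int32 arithmetic wraps in the single all-minimum corner case):

    // maddPair models one 32-bit element of VPMADDWD.
    func maddPair(aLo, aHi, bLo, bHi int16) int32 {
        return int32(aLo)*int32(bLo) + int32(aHi)*int32(bHi)
    }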
func M512MaddubsEpi16 ¶
M512MaddubsEpi16: Vertically multiply each unsigned 8-bit integer from 'a' with the corresponding signed 8-bit integer from 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_maddubs_epi16'. Requires AVX512BW.
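Here the 'a' bytes are read as unsigned and the 'b' bytes as signed, and only the final pair sum saturates. A Go sketch of one element (maddubsPair is illustrative only):

    // maddubsPair models one 16-bit element of VPMADDUBSW: unsigned
    // bytes from a multiply signed bytes from b; the pair sum
    // saturates to the int16 range.
    func maddubsPair(aLo, aHi uint8, bLo, bHi int8) int16 {
        sum := int32(aLo)*int32(bLo) + int32(aHi)*int32(bHi)
        if sum > 32767 {
            return 32767
        }
        if sum < -32768 {
            return -32768
        }
        return int16(sum)
    }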
func M512Mask2Permutex2varEpi16 ¶
func M512Mask2Permutex2varEpi16(a x86.M512i, idx x86.M512i, k x86.Mmask32, b x86.M512i) (dst x86.M512i)
M512Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2W'. Intrinsic: '_mm512_mask2_permutex2var_epi16'. Requires AVX512BW.
func M512MaskAbsEpi16 ¶
M512MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm512_mask_abs_epi16'. Requires AVX512BW.
func M512MaskAbsEpi8 ¶
M512MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm512_mask_abs_epi8'. Requires AVX512BW.
func M512MaskAddEpi16 ¶
M512MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm512_mask_add_epi16'. Requires AVX512BW.
func M512MaskAddEpi8 ¶
M512MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm512_mask_add_epi8'. Requires AVX512BW.
func M512MaskAddsEpi16 ¶
M512MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm512_mask_adds_epi16'. Requires AVX512BW.
func M512MaskAddsEpi8 ¶
M512MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm512_mask_adds_epi8'. Requires AVX512BW.
func M512MaskAddsEpu16 ¶
M512MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm512_mask_adds_epu16'. Requires AVX512BW.
func M512MaskAddsEpu8 ¶
M512MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm512_mask_adds_epu8'. Requires AVX512BW.
func M512MaskAlignrEpi8 ¶
func M512MaskAlignrEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
M512MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3
    i := j*128
    tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
    tmp_dst[i+127:i] := tmp[127:0]
ENDFOR
FOR j := 0 to 63
    i := j*8
    IF k[j]
        dst[i+7:i] := tmp_dst[i+7:i]
    ELSE
        dst[i+7:i] := src[i+7:i]
    FI
ENDFOR
dst[MAX:512] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm512_mask_alignr_epi8'. Requires AVX512BW.
func M512MaskAvgEpu16 ¶
M512MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm512_mask_avg_epu16'. Requires AVX512BW.
func M512MaskAvgEpu8 ¶
M512MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm512_mask_avg_epu8'. Requires AVX512BW.
func M512MaskBlendEpi16 ¶
M512MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBLENDMW'. Intrinsic: '_mm512_mask_blend_epi16'. Requires AVX512BW.
func M512MaskBlendEpi8 ¶
M512MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBLENDMB'. Intrinsic: '_mm512_mask_blend_epi8'. Requires AVX512BW.
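Blend is the simplest of the masked operations: the mask picks whole elements. A Go sketch (blendBytes is illustrative only):

    // blendBytes models VPBLENDMB: bit j of k selects b[j] over a[j].
    func blendBytes(k uint64, a, b [64]byte) (dst [64]byte) {
        for j := range a {
            if k&(1<<uint(j)) != 0 {
                dst[j] = b[j]
            } else {
                dst[j] = a[j]
            }
        }
        return
    }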
func M512MaskBroadcastbEpi8 ¶
M512MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_mask_broadcastb_epi8'. Requires AVX512BW.
func M512MaskBroadcastwEpi16 ¶
M512MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_mask_broadcastw_epi16'. Requires AVX512BW.
func M512MaskCmpEpi16Mask ¶
M512MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
    i := j*16
    IF k1[j]
        k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
    ELSE
        k[j] := 0
    FI
ENDFOR
k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpEpi8Mask ¶
M512MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
    i := j*8
    IF k1[j]
        k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
    ELSE
        k[j] := 0
    FI
ENDFOR
k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpEpu16Mask ¶
M512MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
    i := j*16
    IF k1[j]
        k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
    ELSE
        k[j] := 0
    FI
ENDFOR
k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpEpu8Mask ¶
M512MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
    i := j*8
    IF k1[j]
        k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
    ELSE
        k[j] := 0
    FI
ENDFOR
k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskCmpeqEpi16Mask ¶
M512MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpeq_epi16_mask'. Requires AVX512BW.
func M512MaskCmpeqEpi8Mask ¶
M512MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpeq_epi8_mask'. Requires AVX512BW.
func M512MaskCmpeqEpu16Mask ¶
M512MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpeq_epu16_mask'. Requires AVX512BW.
func M512MaskCmpeqEpu8Mask ¶
M512MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpeq_epu8_mask'. Requires AVX512BW.
func M512MaskCmpgeEpi16Mask ¶
M512MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpge_epi16_mask'. Requires AVX512BW.
func M512MaskCmpgeEpi8Mask ¶
M512MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpge_epi8_mask'. Requires AVX512BW.
func M512MaskCmpgeEpu16Mask ¶
M512MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpge_epu16_mask'. Requires AVX512BW.
func M512MaskCmpgeEpu8Mask ¶
M512MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpge_epu8_mask'. Requires AVX512BW.
func M512MaskCmpgtEpi16Mask ¶
M512MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpgt_epi16_mask'. Requires AVX512BW.
func M512MaskCmpgtEpi8Mask ¶
M512MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpgt_epi8_mask'. Requires AVX512BW.
func M512MaskCmpgtEpu16Mask ¶
M512MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpgt_epu16_mask'. Requires AVX512BW.
func M512MaskCmpgtEpu8Mask ¶
M512MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpgt_epu8_mask'. Requires AVX512BW.
func M512MaskCmpleEpi16Mask ¶
M512MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmple_epi16_mask'. Requires AVX512BW.
func M512MaskCmpleEpi8Mask ¶
M512MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmple_epi8_mask'. Requires AVX512BW.
func M512MaskCmpleEpu16Mask ¶
M512MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmple_epu16_mask'. Requires AVX512BW.
func M512MaskCmpleEpu8Mask ¶
M512MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmple_epu8_mask'. Requires AVX512BW.
func M512MaskCmpltEpi16Mask ¶
M512MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmplt_epi16_mask'. Requires AVX512BW.
func M512MaskCmpltEpi8Mask ¶
M512MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmplt_epi8_mask'. Requires AVX512BW.
func M512MaskCmpltEpu16Mask ¶
M512MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmplt_epu16_mask'. Requires AVX512BW.
func M512MaskCmpltEpu8Mask ¶
M512MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmplt_epu8_mask'. Requires AVX512BW.
func M512MaskCmpneqEpi16Mask ¶
M512MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpneq_epi16_mask'. Requires AVX512BW.
func M512MaskCmpneqEpi8Mask ¶
M512MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpneq_epi8_mask'. Requires AVX512BW.
func M512MaskCmpneqEpu16Mask ¶
M512MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpneq_epu16_mask'. Requires AVX512BW.
func M512MaskCmpneqEpu8Mask ¶
M512MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpneq_epu8_mask'. Requires AVX512BW.
func M512MaskCvtepi16Epi8 ¶
M512MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm512_mask_cvtepi16_epi8'. Requires AVX512BW.
func M512MaskCvtepi8Epi16 ¶
M512MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_mask_cvtepi8_epi16'. Requires AVX512BW.
func M512MaskCvtepu8Epi16 ¶
M512MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_mask_cvtepu8_epi16'. Requires AVX512BW.
func M512MaskCvtsepi16Epi8 ¶
M512MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_mask_cvtsepi16_epi8'. Requires AVX512BW.
func M512MaskCvtusepi16Epi8 ¶
M512MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_mask_cvtusepi16_epi8'. Requires AVX512BW.
func M512MaskDbsadEpu8 ¶
func M512MaskDbsadEpu8(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)
M512MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 3 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 7 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_mask_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
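The quadruplet selection above is the only part controlled by 'imm8'. Since these packages contain no working intrinsics, a hypothetical scalar helper (not part of the package) can sketch the select() step, assuming it picks one of the four aligned 32-bit quadruplets in a 16-byte lane:

    // selectQuad sketches select(b[i+127:i], ctrl): a 2-bit control picks
    // one aligned 32-bit quadruplet from a 128-bit lane of 'b'.
    func selectQuad(lane [16]byte, ctrl byte) [4]byte {
        var q [4]byte
        copy(q[:], lane[int(ctrl&3)*4:])
        return q
    }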
func M512MaskMaddEpi16 ¶
M512MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm512_mask_madd_epi16'. Requires AVX512BW.
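As a hedged scalar sketch of one 32-bit element (hypothetical helper, not part of the package): the two adjacent 16-bit products are formed at 32-bit width, so the multiplications themselves cannot overflow.

    // maddPair mirrors one element of madd_epi16: multiply two adjacent
    // int16 pairs at 32-bit width and sum the products.
    func maddPair(aLo, aHi, bLo, bHi int16) int32 {
        return int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
    }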
func M512MaskMaddubsEpi16 ¶
M512MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_mask_maddubs_epi16'. Requires AVX512BW.
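One 16-bit element of this operation can be sketched in scalar Go (hypothetical helper, not part of the package); note that 'a' contributes unsigned bytes, 'b' contributes signed bytes, and only the pair sum saturates:

    // maddubsPair mirrors one element of maddubs_epi16: the pair sum of
    // uint8*int8 products is clamped to the signed 16-bit range.
    func maddubsPair(aLo, aHi uint8, bLo, bHi int8) int16 {
        sum := int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
        switch {
        case sum > 32767:
            return 32767
        case sum < -32768:
            return -32768
        }
        return int16(sum)
    }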
func M512MaskMaxEpi16 ¶
M512MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm512_mask_max_epi16'. Requires AVX512BW.
func M512MaskMaxEpi8 ¶
M512MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm512_mask_max_epi8'. Requires AVX512BW.
func M512MaskMaxEpu16 ¶
M512MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm512_mask_max_epu16'. Requires AVX512BW.
func M512MaskMaxEpu8 ¶
M512MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm512_mask_max_epu8'. Requires AVX512BW.
func M512MaskMinEpi16 ¶
M512MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm512_mask_min_epi16'. Requires AVX512BW.
func M512MaskMinEpi8 ¶
M512MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm512_mask_min_epi8'. Requires AVX512BW.
func M512MaskMinEpu16 ¶
M512MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm512_mask_min_epu16'. Requires AVX512BW.
func M512MaskMinEpu8 ¶
M512MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm512_mask_min_epu8'. Requires AVX512BW.
func M512MaskMovEpi16 ¶
M512MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm512_mask_mov_epi16'. Requires AVX512BW.
func M512MaskMovEpi8 ¶
M512MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm512_mask_mov_epi8'. Requires AVX512BW.
func M512MaskMulhiEpi16 ¶
M512MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm512_mask_mulhi_epi16'. Requires AVX512BW.
func M512MaskMulhiEpu16 ¶
M512MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm512_mask_mulhi_epu16'. Requires AVX512BW.
func M512MaskMulhrsEpi16 ¶
M512MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_mask_mulhrs_epi16'. Requires AVX512BW.
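The rounding step is easiest to see in scalar form (hypothetical helper, not part of the package): the full 32-bit product is shifted right by 14, incremented, and bits [16:1] are kept, so e.g. mulhrs(0x4000, 0x4000) yields 0x2000.

    // mulhrs mirrors one element of mulhrs_epi16, per the pseudocode:
    // tmp = ((a*b) >> 14) + 1; result = tmp[16:1].
    func mulhrs(a, b int16) int16 {
        tmp := (int32(a)*int32(b))>>14 + 1
        return int16(tmp >> 1)
    }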
func M512MaskMulloEpi16 ¶
M512MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm512_mask_mullo_epi16'. Requires AVX512BW.
func M512MaskPacksEpi16 ¶
M512MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_mask_packs_epi16'. Requires AVX512BW.
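The per-element saturation can be sketched as (hypothetical helper, not part of the package):

    // satInt16ToInt8 mirrors Saturate_Int16_To_Int8: out-of-range values
    // clamp to the nearest signed 8-bit bound.
    func satInt16ToInt8(v int16) int8 {
        switch {
        case v > 127:
            return 127
        case v < -128:
            return -128
        }
        return int8(v)
    }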
func M512MaskPacksEpi32 ¶
M512MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_mask_packs_epi32'. Requires AVX512BW.
func M512MaskPackusEpi16 ¶
M512MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_mask_packus_epi16'. Requires AVX512BW.
func M512MaskPackusEpi32 ¶
M512MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_mask_packus_epi32'. Requires AVX512BW.
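Unsigned saturation differs from the signed case only in its bounds; a hypothetical scalar helper (not part of the package):

    // satInt32ToUint16 mirrors Saturate_Int32_To_UnsignedInt16: negative
    // inputs clamp to 0, large inputs to 65535.
    func satInt32ToUint16(v int32) uint16 {
        switch {
        case v < 0:
            return 0
        case v > 65535:
            return 65535
        }
        return uint16(v)
    }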
func M512MaskPermutex2varEpi16 ¶
func M512MaskPermutex2varEpi16(a x86.M512i, k x86.Mmask32, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMT2W'. Intrinsic: '_mm512_mask_permutex2var_epi16'. Requires AVX512BW.
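The index encoding follows directly from the pseudocode: the low 5 bits of each 'idx' element select a word, and bit 5 selects the table. A hypothetical scalar helper (not part of the package):

    // permutex2varWord mirrors one result element of permutex2var_epi16.
    func permutex2varWord(a, b *[32]uint16, idx uint16) uint16 {
        off := idx & 0x1f // word index within the selected table
        if idx&0x20 != 0 {
            return b[off] // selector bit set: read from 'b'
        }
        return a[off]
    }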
func M512MaskPermutexvarEpi16 ¶
func M512MaskPermutexvarEpi16(src x86.M512i, k x86.Mmask32, idx x86.M512i, a x86.M512i) (dst x86.M512i)
M512MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm512_mask_permutexvar_epi16'. Requires AVX512BW.
func M512MaskSet1Epi16 ¶
M512MaskSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_mask_set1_epi16'. Requires AVX512BW.
func M512MaskSet1Epi8 ¶
M512MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_mask_set1_epi8'. Requires AVX512BW.
func M512MaskShuffleEpi8 ¶
M512MaskShuffleEpi8: Shuffle 8-bit integers in 'a' within 128-bit lanes using the control in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm512_mask_shuffle_epi8'. Requires AVX512BW.
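Per byte, the control works as follows (hypothetical scalar helper, not part of the package):

    // shuffleByte mirrors one byte of shuffle_epi8 within a 16-byte lane:
    // bit 7 of the control zeroes the result, otherwise the low 4 bits
    // index into the lane.
    func shuffleByte(lane [16]byte, ctrl byte) byte {
        if ctrl&0x80 != 0 {
            return 0
        }
        return lane[ctrl&0x0f]
    }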
func M512MaskShufflehiEpi16 ¶
M512MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] tmp_dst[319:256] := a[319:256] tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] tmp_dst[447:384] := a[447:384] tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_mask_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskShuffleloEpi16 ¶
M512MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] tmp_dst[383:320] := a[383:320] tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_mask_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
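The imm8 decode repeats per 128-bit lane; sketched for one lane (hypothetical helper, not part of the package):

    // shuffleLoWords mirrors one lane of shufflelo_epi16: each 2-bit field
    // of imm8 picks one of the four low words; the high words pass through.
    func shuffleLoWords(lane [8]uint16, imm8 byte) (dst [8]uint16) {
        for j := 0; j < 4; j++ {
            dst[j] = lane[(imm8>>(2*uint(j)))&3]
        }
        copy(dst[4:], lane[4:]) // high 64 bits copied unchanged
        return
    }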
func M512MaskSllEpi16 ¶
M512MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_mask_sll_epi16'. Requires AVX512BW.
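Note that 'count' here is a single 64-bit quantity applied to every element, unlike the per-element sllv variant below. A hypothetical scalar helper (not part of the package):

    // sllWord mirrors one element of sll_epi16: counts above 15 flush the
    // element to zero rather than wrapping.
    func sllWord(a uint16, count uint64) uint16 {
        if count > 15 {
            return 0
        }
        return a << count
    }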
func M512MaskSlliEpi16 ¶
M512MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_mask_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskSllvEpi16 ¶
M512MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm512_mask_sllv_epi16'. Requires AVX512BW.
func M512MaskSraEpi16 ¶
M512MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_mask_sra_epi16'. Requires AVX512BW.
func M512MaskSraiEpi16 ¶
M512MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_mask_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskSravEpi16 ¶
M512MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm512_mask_srav_epi16'. Requires AVX512BW.
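Sketched per element (hypothetical helper, not part of the package); the pseudocode places no explicit bound on the per-element count, but an arithmetic shift by 15 or more leaves only replicated sign bits, which the clamp below makes explicit:

    // sravWord mirrors one element of srav_epi16 under that assumption.
    func sravWord(a int16, count uint16) int16 {
        if count > 15 {
            count = 15 // only replicated sign bits remain past this point
        }
        return a >> count
    }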
func M512MaskSrlEpi16 ¶
M512MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_mask_srl_epi16'. Requires AVX512BW.
func M512MaskSrliEpi16 ¶
M512MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_mask_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskSrlvEpi16 ¶
M512MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm512_mask_srlv_epi16'. Requires AVX512BW.
func M512MaskSubEpi16 ¶
M512MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm512_mask_sub_epi16'. Requires AVX512BW.
func M512MaskSubEpi8 ¶
M512MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm512_mask_sub_epi8'. Requires AVX512BW.
func M512MaskSubsEpi16 ¶
M512MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm512_mask_subs_epi16'. Requires AVX512BW.
func M512MaskSubsEpi8 ¶
M512MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm512_mask_subs_epi8'. Requires AVX512BW.
func M512MaskSubsEpu16 ¶
M512MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_mask_subs_epu16'. Requires AVX512BW.
func M512MaskSubsEpu8 ¶
M512MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_mask_subs_epu8'. Requires AVX512BW.
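Unsigned saturating subtraction clamps at zero instead of wrapping; a hypothetical scalar helper (not part of the package):

    // subsEpu8 mirrors one element of subs_epu8.
    func subsEpu8(a, b uint8) uint8 {
        if a < b {
            return 0
        }
        return a - b
    }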
func M512MaskTestEpi16Mask ¶
M512MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm512_mask_test_epi16_mask'. Requires AVX512BW.
func M512MaskTestEpi8Mask ¶
M512MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm512_mask_test_epi8_mask'. Requires AVX512BW.
func M512MaskTestnEpi16Mask ¶
M512MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 31 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm512_mask_testn_epi16_mask'. Requires AVX512BW.
func M512MaskTestnEpi8Mask ¶
M512MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 63 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm512_mask_testn_epi8_mask'. Requires AVX512BW.
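The test and testn families differ only in the comparison against zero; per element (hypothetical helper, not part of the package):

    // testWord mirrors the mask bit produced by test/testn on one 16-bit
    // element, gated by the incoming writemask bit k1.
    func testWord(a, b uint16, k1 bool) (test, testn bool) {
        if !k1 {
            return false, false
        }
        return a&b != 0, a&b == 0
    }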
func M512MaskUnpackhiEpi16 ¶
M512MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_mask_unpackhi_epi16'. Requires AVX512BW.
func M512MaskUnpackhiEpi8 ¶
M512MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_mask_unpackhi_epi8'. Requires AVX512BW.
func M512MaskUnpackloEpi16 ¶
M512MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_mask_unpacklo_epi16'. Requires AVX512BW.
func M512MaskUnpackloEpi8 ¶
M512MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_mask_unpacklo_epi8'. Requires AVX512BW.
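Sketched for one 128-bit lane (hypothetical helper, not part of the package):

    // interleaveLowBytes mirrors INTERLEAVE_BYTES: the low 8 bytes of 'a'
    // and 'b' alternate into the 16-byte result.
    func interleaveLowBytes(a, b [16]byte) (dst [16]byte) {
        for j := 0; j < 8; j++ {
            dst[2*j] = a[j]
            dst[2*j+1] = b[j]
        }
        return
    }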
func M512MaskzAbsEpi16 ¶
M512MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm512_maskz_abs_epi16'. Requires AVX512BW.
func M512MaskzAbsEpi8 ¶
M512MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm512_maskz_abs_epi8'. Requires AVX512BW.
func M512MaskzAddEpi16 ¶
M512MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm512_maskz_add_epi16'. Requires AVX512BW.
func M512MaskzAddEpi8 ¶
M512MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm512_maskz_add_epi8'. Requires AVX512BW.
func M512MaskzAddsEpi16 ¶
M512MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm512_maskz_adds_epi16'. Requires AVX512BW.
func M512MaskzAddsEpi8 ¶
M512MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm512_maskz_adds_epi8'. Requires AVX512BW.
func M512MaskzAddsEpu16 ¶
M512MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm512_maskz_adds_epu16'. Requires AVX512BW.
func M512MaskzAddsEpu8 ¶
M512MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm512_maskz_adds_epu8'. Requires AVX512BW.
func M512MaskzAlignrEpi8 ¶
M512MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*128 tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8) tmp_dst[i+127:i] := tmp[127:0] ENDFOR FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm512_maskz_alignr_epi8'. Requires AVX512BW.
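Per 128-bit lane the concatenation places 'b' low and 'a' high; a hypothetical scalar helper (not part of the package):

    // alignrLane mirrors one lane of alignr_epi8: a 32-byte buffer of
    // b (low) and a (high) is shifted right by 'count' bytes and the low
    // 16 bytes are kept; bytes shifted in from beyond the buffer are zero.
    func alignrLane(a, b [16]byte, count uint8) (dst [16]byte) {
        var buf [32]byte
        copy(buf[:16], b[:])
        copy(buf[16:], a[:])
        for j := 0; j < 16; j++ {
            if idx := int(count) + j; idx < 32 {
                dst[j] = buf[idx]
            }
        }
        return
    }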
func M512MaskzAvgEpu16 ¶
M512MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm512_maskz_avg_epu16'. Requires AVX512BW.
func M512MaskzAvgEpu8 ¶
M512MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm512_maskz_avg_epu8'. Requires AVX512BW.
func M512MaskzBroadcastbEpi8 ¶
M512MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_maskz_broadcastb_epi8'. Requires AVX512BW.
func M512MaskzBroadcastwEpi16 ¶
M512MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_maskz_broadcastw_epi16'. Requires AVX512BW.
func M512MaskzCvtepi16Epi8 ¶
M512MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm512_maskz_cvtepi16_epi8'. Requires AVX512BW.
func M512MaskzCvtepi8Epi16 ¶
M512MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_maskz_cvtepi8_epi16'. Requires AVX512BW.
func M512MaskzCvtepu8Epi16 ¶
M512MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_maskz_cvtepu8_epi16'. Requires AVX512BW.
func M512MaskzCvtsepi16Epi8 ¶
M512MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_maskz_cvtsepi16_epi8'. Requires AVX512BW.
func M512MaskzCvtusepi16Epi8 ¶
M512MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:256] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_maskz_cvtusepi16_epi8'. Requires AVX512BW.
func M512MaskzDbsadEpu8 ¶
M512MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
FOR j := 0 to 3 i := j*128 tmp[i+31:i] := select(b[i+127:i], imm8[1:0]) tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2]) tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4]) tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6]) ENDFOR FOR j := 0 to 7 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_maskz_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskzMaddEpi16 ¶
M512MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm512_maskz_madd_epi16'. Requires AVX512BW.
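For reference, one 32-bit element of this operation reduces to the following scalar Go sketch (maddPair is our illustrative name, not part of this package):

    // maddPair mirrors one 32-bit element of VPMADDWD: widen two
    // adjacent 16-bit pairs, multiply, and add the 32-bit products.
    func maddPair(aLo, aHi, bLo, bHi int16) int32 {
        return int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
    }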
func M512MaskzMaddubsEpi16 ¶
M512MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_maskz_maddubs_epi16'. Requires AVX512BW.
func M512MaskzMaxEpi16 ¶
M512MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm512_maskz_max_epi16'. Requires AVX512BW.
func M512MaskzMaxEpi8 ¶
M512MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm512_maskz_max_epi8'. Requires AVX512BW.
func M512MaskzMaxEpu16 ¶
M512MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm512_maskz_max_epu16'. Requires AVX512BW.
func M512MaskzMaxEpu8 ¶
M512MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm512_maskz_max_epu8'. Requires AVX512BW.
func M512MaskzMinEpi16 ¶
M512MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm512_maskz_min_epi16'. Requires AVX512BW.
func M512MaskzMinEpi8 ¶
M512MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm512_maskz_min_epi8'. Requires AVX512BW.
func M512MaskzMinEpu16 ¶
M512MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm512_maskz_min_epu16'. Requires AVX512BW.
func M512MaskzMinEpu8 ¶
M512MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm512_maskz_min_epu8'. Requires AVX512BW.
func M512MaskzMovEpi16 ¶
M512MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm512_maskz_mov_epi16'. Requires AVX512BW.
func M512MaskzMovEpi8 ¶
M512MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm512_maskz_mov_epi8'. Requires AVX512BW.
func M512MaskzMulhiEpi16 ¶
M512MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm512_maskz_mulhi_epi16'. Requires AVX512BW.
func M512MaskzMulhiEpu16 ¶
M512MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm512_maskz_mulhi_epu16'. Requires AVX512BW.
func M512MaskzMulhrsEpi16 ¶
M512MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_maskz_mulhrs_epi16'. Requires AVX512BW.
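The '[16:1]' bit slice is easy to misread; per element it amounts to this scalar Go sketch (mulhrs is our illustrative name, not part of this package):

    // mulhrs mirrors one element of VPMULHRSW: widen and multiply,
    // shift right by 14, add the rounding 1, keep bits [16:1].
    func mulhrs(a, b int16) int16 {
        tmp := (int32(a)*int32(b))>>14 + 1
        return int16(tmp >> 1) // bits [16:1] of tmp
    }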
func M512MaskzMulloEpi16 ¶
M512MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm512_maskz_mullo_epi16'. Requires AVX512BW.
func M512MaskzPacksEpi16 ¶
M512MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) FOR j := 0 to 63 i := j*8 
IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_maskz_packs_epi16'. Requires AVX512BW.
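The Saturate_Int16_To_Int8 helper used throughout the block above clamps rather than truncates; in Go it would look roughly like this (satInt16ToInt8 is our illustrative name, not part of this package):

    // satInt16ToInt8 clamps a signed 16-bit value into [-128, 127],
    // as VPACKSSWB does for every element it packs.
    func satInt16ToInt8(v int16) int8 {
        switch {
        case v > 127:
            return 127
        case v < -128:
            return -128
        }
        return int8(v)
    }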
func M512MaskzPacksEpi32 ¶
M512MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_maskz_packs_epi32'. Requires AVX512BW.
func M512MaskzPackusEpi16 ¶
M512MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288]) tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_maskz_packus_epi16'. Requires AVX512BW.
func M512MaskzPackusEpi32 ¶
M512MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_maskz_packus_epi32'. Requires AVX512BW.
func M512MaskzPermutex2varEpi16 ¶
func M512MaskzPermutex2varEpi16(k x86.Mmask32, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)
M512MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm512_maskz_permutex2var_epi16'. Requires AVX512BW.
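Per element, the selector logic is small; a scalar Go sketch (permutex2varWord is our illustrative name, not part of this package):

    // permutex2varWord mirrors one element of VPERMI2W/VPERMT2W:
    // the low five index bits pick a word, bit 5 picks a or b.
    func permutex2varWord(a, b *[32]int16, idx uint16) int16 {
        off := idx & 0x1f  // idx[4:0]: word offset within the source
        if idx&0x20 != 0 { // idx[5]: source selector
            return b[off]
        }
        return a[off]
    }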
func M512MaskzPermutexvarEpi16 ¶
M512MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm512_maskz_permutexvar_epi16'. Requires AVX512BW.
func M512MaskzSet1Epi16 ¶
M512MaskzSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_maskz_set1_epi16'. Requires AVX512BW.
func M512MaskzSet1Epi8 ¶
M512MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_maskz_set1_epi8'. Requires AVX512BW.
func M512MaskzShuffleEpi8 ¶
M512MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm512_maskz_shuffle_epi8'. Requires AVX512BW.
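Because VPSHUFB shuffles within 128-bit lanes, a single lane can be sketched in Go as follows (shuffleBytesLane is our illustrative name, not part of this package):

    // shuffleBytesLane mirrors VPSHUFB for one 128-bit lane: a set
    // high bit in the control byte zeroes the result byte, else the
    // low four bits index into the lane of 'a'.
    func shuffleBytesLane(a, b [16]uint8) [16]uint8 {
        var dst [16]uint8
        for j, ctrl := range b {
            if ctrl&0x80 == 0 {
                dst[j] = a[ctrl&0x0f]
            } // else dst[j] stays 0
        }
        return dst
    }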
func M512MaskzShufflehiEpi16 ¶
M512MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] tmp_dst[191:128] := a[191:128] tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] tmp_dst[319:256] := a[319:256] tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] tmp_dst[447:384] := a[447:384] tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_maskz_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskzShuffleloEpi16 ¶
M512MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] tmp_dst[255:192] := a[255:192] tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] tmp_dst[383:320] := a[383:320] tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] tmp_dst[511:448] := a[511:448] FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_maskz_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskzSllEpi16 ¶
M512MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_maskz_sll_epi16'. Requires AVX512BW.
func M512MaskzSlliEpi16 ¶
M512MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_maskz_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskzSllvEpi16 ¶
M512MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm512_maskz_sllv_epi16'. Requires AVX512BW.
func M512MaskzSraEpi16 ¶
M512MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_maskz_sra_epi16'. Requires AVX512BW.
func M512MaskzSraiEpi16 ¶
M512MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_maskz_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
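Note the clamp: counts above 15 fill the element with copies of the sign bit, which a scalar Go sketch makes explicit (sraiWord is our illustrative name, not part of this package):

    // sraiWord mirrors one element of VPSRAW with an immediate:
    // shifting by more than 15 saturates to an all-sign-bit fill.
    func sraiWord(a int16, imm8 uint8) int16 {
        if imm8 > 15 {
            imm8 = 15 // a >> 15 yields 0 or -1, the sign fill
        }
        return a >> imm8
    }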
func M512MaskzSravEpi16 ¶
M512MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm512_maskz_srav_epi16'. Requires AVX512BW.
func M512MaskzSrlEpi16 ¶
M512MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_maskz_srl_epi16'. Requires AVX512BW.
func M512MaskzSrliEpi16 ¶
M512MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_maskz_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512MaskzSrlvEpi16 ¶
M512MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm512_maskz_srlv_epi16'. Requires AVX512BW.
func M512MaskzSubEpi16 ¶
M512MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm512_maskz_sub_epi16'. Requires AVX512BW.
func M512MaskzSubEpi8 ¶
M512MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm512_maskz_sub_epi8'. Requires AVX512BW.
func M512MaskzSubsEpi16 ¶
M512MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm512_maskz_subs_epi16'. Requires AVX512BW.
func M512MaskzSubsEpi8 ¶
M512MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm512_maskz_subs_epi8'. Requires AVX512BW.
func M512MaskzSubsEpu16 ¶
M512MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_maskz_subs_epu16'. Requires AVX512BW.
func M512MaskzSubsEpu8 ¶
M512MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_maskz_subs_epu8'. Requires AVX512BW.
func M512MaskzUnpackhiEpi16 ¶
M512MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_maskz_unpackhi_epi16'. Requires AVX512BW.
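The INTERLEAVE_HIGH_WORDS helper above can be sketched in Go per 128-bit lane, treating each source lane as eight 16-bit words (interleaveHighWords is our illustrative name, not part of this package):

    // interleaveHighWords mirrors INTERLEAVE_HIGH_WORDS for one
    // 128-bit lane: alternate the upper four words of each source.
    func interleaveHighWords(src1, src2 [8]int16) [8]int16 {
        var dst [8]int16
        for n := 0; n < 4; n++ {
            dst[2*n] = src1[4+n]   // word from the high half of src1
            dst[2*n+1] = src2[4+n] // word from the high half of src2
        }
        return dst
    }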
func M512MaskzUnpackhiEpi8 ¶
M512MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_maskz_unpackhi_epi8'. Requires AVX512BW.
func M512MaskzUnpackloEpi16 ¶
M512MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_maskz_unpacklo_epi16'. Requires AVX512BW.
func M512MaskzUnpackloEpi8 ¶
M512MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_maskz_unpacklo_epi8'. Requires AVX512BW.
func M512MaxEpi16 ¶
M512MaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 31 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm512_max_epi16'. Requires AVX512BW.
func M512MaxEpi8 ¶
M512MaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 63 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm512_max_epi8'. Requires AVX512BW.
func M512MaxEpu16 ¶
M512MaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 31 i := j*16 IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm512_max_epu16'. Requires AVX512BW.
func M512MaxEpu8 ¶
M512MaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.
FOR j := 0 to 63 i := j*8 IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm512_max_epu8'. Requires AVX512BW.
func M512MinEpi16 ¶
M512MinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 31 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm512_min_epi16'. Requires AVX512BW.
func M512MinEpi8 ¶
M512MinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 63 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm512_min_epi8'. Requires AVX512BW.
func M512MinEpu16 ¶
M512MinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 31 i := j*16 IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm512_min_epu16'. Requires AVX512BW.
func M512MinEpu8 ¶
M512MinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.
FOR j := 0 to 63 i := j*8 IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm512_min_epu8'. Requires AVX512BW.
func M512Movepi16Mask ¶
M512Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.
FOR j := 0 to 31 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:32] := 0
Instruction: 'VPMOVW2M'. Intrinsic: '_mm512_movepi16_mask'. Requires AVX512BW.
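In scalar Go terms the whole operation is a sign-bit gather (movepi16Mask is our illustrative name, not part of this package):

    // movepi16Mask mirrors VPMOVW2M: bit j of the mask is the
    // most significant (sign) bit of 16-bit element j.
    func movepi16Mask(a [32]int16) uint32 {
        var k uint32
        for j, v := range a {
            if v < 0 { // sign bit set
                k |= 1 << uint(j)
            }
        }
        return k
    }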
func M512Movepi8Mask ¶
M512Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.
FOR j := 0 to 63 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:64] := 0
Instruction: 'VPMOVB2M'. Intrinsic: '_mm512_movepi8_mask'. Requires AVX512BW.
func M512MovmEpi16 ¶
M512MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.
FOR j := 0 to 31 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVM2W'. Intrinsic: '_mm512_movm_epi16'. Requires AVX512BW.
func M512MovmEpi8 ¶
M512MovmEpi8: Set each packed 8-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.
FOR j := 0 to 63 i := j*8 IF k[j] dst[i+7:i] := 0xFF ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPMOVM2B'. Intrinsic: '_mm512_movm_epi8'. Requires AVX512BW.
func M512MulhiEpi16 ¶
M512MulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.
FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm512_mulhi_epi16'. Requires AVX512BW.
func M512MulhiEpu16 ¶
M512MulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.
FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm512_mulhi_epu16'. Requires AVX512BW.
func M512MulhrsEpi16 ¶
M512MulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst'.
FOR j := 0 to 31 i := j*16 tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_mulhrs_epi16'. Requires AVX512BW.
func M512MulloEpi16 ¶
M512MulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst'.
FOR j := 0 to 31 i := j*16 tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ENDFOR dst[MAX:512] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm512_mullo_epi16'. Requires AVX512BW.
func M512PacksEpi16 ¶
M512PacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst'.
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) dst[135:128] := Saturate_Int16_To_Int8 (a[143:128]) dst[143:136] := Saturate_Int16_To_Int8 (a[159:144]) dst[151:144] := Saturate_Int16_To_Int8 (a[175:160]) dst[159:152] := Saturate_Int16_To_Int8 (a[191:176]) dst[167:160] := Saturate_Int16_To_Int8 (a[207:192]) dst[175:168] := Saturate_Int16_To_Int8 (a[223:208]) dst[183:176] := Saturate_Int16_To_Int8 (a[239:224]) dst[191:184] := Saturate_Int16_To_Int8 (a[255:240]) dst[199:192] := Saturate_Int16_To_Int8 (b[143:128]) dst[207:200] := Saturate_Int16_To_Int8 (b[159:144]) dst[215:208] := Saturate_Int16_To_Int8 (b[175:160]) dst[223:216] := Saturate_Int16_To_Int8 (b[191:176]) dst[231:224] := Saturate_Int16_To_Int8 (b[207:192]) dst[239:232] := Saturate_Int16_To_Int8 (b[223:208]) dst[247:240] := Saturate_Int16_To_Int8 (b[239:224]) dst[255:248] := Saturate_Int16_To_Int8 (b[255:240]) dst[263:256] := Saturate_Int16_To_Int8 (a[271:256]) dst[271:264] := Saturate_Int16_To_Int8 (a[287:272]) dst[279:272] := Saturate_Int16_To_Int8 (a[303:288]) dst[287:280] := Saturate_Int16_To_Int8 (a[319:304]) dst[295:288] := Saturate_Int16_To_Int8 (a[335:320]) dst[303:296] := Saturate_Int16_To_Int8 (a[351:336]) dst[311:304] := Saturate_Int16_To_Int8 (a[367:352]) dst[319:312] := Saturate_Int16_To_Int8 (a[383:368]) dst[327:320] := Saturate_Int16_To_Int8 (b[271:256]) dst[335:328] := Saturate_Int16_To_Int8 (b[287:272]) dst[343:336] := Saturate_Int16_To_Int8 (b[303:288]) dst[351:344] := Saturate_Int16_To_Int8 (b[319:304]) dst[359:352] := Saturate_Int16_To_Int8 (b[335:320]) dst[367:360] := Saturate_Int16_To_Int8 (b[351:336]) dst[375:368] := Saturate_Int16_To_Int8 (b[367:352]) dst[383:376] := Saturate_Int16_To_Int8 (b[383:368]) dst[391:384] := Saturate_Int16_To_Int8 (a[399:384]) dst[399:392] := Saturate_Int16_To_Int8 (a[415:400]) dst[407:400] := Saturate_Int16_To_Int8 (a[431:416]) dst[415:408] := Saturate_Int16_To_Int8 (a[447:432]) dst[423:416] := Saturate_Int16_To_Int8 (a[463:448]) dst[431:424] := Saturate_Int16_To_Int8 (a[479:464]) dst[439:432] := Saturate_Int16_To_Int8 (a[495:480]) dst[447:440] := Saturate_Int16_To_Int8 (a[511:496]) dst[455:448] := Saturate_Int16_To_Int8 (b[399:384]) dst[463:456] := Saturate_Int16_To_Int8 (b[415:400]) dst[471:464] := Saturate_Int16_To_Int8 (b[431:416]) dst[479:472] := Saturate_Int16_To_Int8 (b[447:432]) dst[487:480] := Saturate_Int16_To_Int8 (b[463:448]) dst[495:488] := Saturate_Int16_To_Int8 (b[479:464]) dst[503:496] := Saturate_Int16_To_Int8 (b[495:480]) dst[511:504] := Saturate_Int16_To_Int8 (b[511:496]) dst[MAX:512] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_packs_epi16'. Requires AVX512BW.
func M512PacksEpi32 ¶
M512PacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst'.
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) dst[143:128] := Saturate_Int32_To_Int16 (a[159:128]) dst[159:144] := Saturate_Int32_To_Int16 (a[191:160]) dst[175:160] := Saturate_Int32_To_Int16 (a[223:192]) dst[191:176] := Saturate_Int32_To_Int16 (a[255:224]) dst[207:192] := Saturate_Int32_To_Int16 (b[159:128]) dst[223:208] := Saturate_Int32_To_Int16 (b[191:160]) dst[239:224] := Saturate_Int32_To_Int16 (b[223:192]) dst[255:240] := Saturate_Int32_To_Int16 (b[255:224]) dst[271:256] := Saturate_Int32_To_Int16 (a[287:256]) dst[287:272] := Saturate_Int32_To_Int16 (a[319:288]) dst[303:288] := Saturate_Int32_To_Int16 (a[351:320]) dst[319:304] := Saturate_Int32_To_Int16 (a[383:352]) dst[335:320] := Saturate_Int32_To_Int16 (b[287:256]) dst[351:336] := Saturate_Int32_To_Int16 (b[319:288]) dst[367:352] := Saturate_Int32_To_Int16 (b[351:320]) dst[383:368] := Saturate_Int32_To_Int16 (b[383:352]) dst[399:384] := Saturate_Int32_To_Int16 (a[415:384]) dst[415:400] := Saturate_Int32_To_Int16 (a[447:416]) dst[431:416] := Saturate_Int32_To_Int16 (a[479:448]) dst[447:432] := Saturate_Int32_To_Int16 (a[511:480]) dst[463:448] := Saturate_Int32_To_Int16 (b[415:384]) dst[479:464] := Saturate_Int32_To_Int16 (b[447:416]) dst[495:480] := Saturate_Int32_To_Int16 (b[479:448]) dst[511:496] := Saturate_Int32_To_Int16 (b[511:480]) dst[MAX:512] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_packs_epi32'. Requires AVX512BW.
func M512PackusEpi16 ¶
M512PackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst'.
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128]) dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144]) dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160]) dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176]) dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192]) dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208]) dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224]) dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240]) dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128]) dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144]) dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160]) dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176]) dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192]) dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208]) dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224]) dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240]) dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256]) dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272]) dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288]) dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304]) dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320]) dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336]) dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352]) dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368]) dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256]) dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272]) dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288]) dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304]) dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320]) dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336]) dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352]) dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368]) dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384]) dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400]) dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416]) dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432]) dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448]) dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464]) dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480]) dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496]) dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384]) dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400]) dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416]) dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432]) 
dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448]) dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464]) dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480]) dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496]) dst[MAX:512] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_packus_epi16'. Requires AVX512BW.
func M512PackusEpi32 ¶
M512PackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst'.
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128]) dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160]) dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192]) dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224]) dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128]) dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160]) dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192]) dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224]) dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256]) dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288]) dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320]) dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352]) dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256]) dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288]) dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320]) dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352]) dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384]) dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416]) dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448]) dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480]) dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384]) dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416]) dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448]) dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480]) dst[MAX:512] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_packus_epi32'. Requires AVX512BW.
func M512Permutex2varEpi16 ¶
M512Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 off := 16*idx[i+4:i] dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm512_permutex2var_epi16'. Requires AVX512BW.
func M512PermutexvarEpi16 ¶
M512PermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 id := idx[i+4:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:512] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm512_permutexvar_epi16'. Requires AVX512BW.
func M512SadEpu8 ¶
M512SadEpu8: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in 'dst'.
FOR j := 0 to 63 i := j*8 tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) ENDFOR FOR j := 0 to 7 i := j*64 dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] dst[i+63:i+16] := 0 ENDFOR dst[MAX:512] := 0
Instruction: 'VPSADBW'. Intrinsic: '_mm512_sad_epu8'. Requires AVX512BW.
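A rough scalar model of the semantics above, with an illustrative helper name (sadEpu8 is not part of this package): absolute byte differences are summed eight at a time, producing one 16-bit sum in the low bits of each 64-bit group.

func sadEpu8(a, b [64]uint8) (dst [8]uint64) {
	for j := range dst {
		var sum uint16
		for t := 0; t < 8; t++ {
			d := int(a[j*8+t]) - int(b[j*8+t])
			if d < 0 {
				d = -d
			}
			sum += uint16(d)
		}
		dst[j] = uint64(sum) // bits 63:16 of each group stay zero
	}
	return
}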
func M512ShuffleEpi8 ¶
M512ShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm512_shuffle_epi8'. Requires AVX512BW.
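A scalar Go sketch of the control-byte rule (illustrative name, not part of this package). The sketch models a single 16-byte lane; VPSHUFB applies the same rule independently within each 128-bit lane.

func shuffleEpi8Lane(a, b [16]uint8) (dst [16]uint8) {
	for j := range dst {
		if b[j]&0x80 != 0 { // high control bit set: zero the byte
			dst[j] = 0
		} else {
			dst[j] = a[b[j]&0x0F] // low four control bits index the lane
		}
	}
	return
}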
func M512ShufflehiEpi16 ¶
M512ShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst'.
dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[319:256] := a[319:256]
dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
dst[447:384] := a[447:384]
dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]
dst[MAX:512] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512ShuffleloEpi16 ¶
M512ShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst'.
dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
dst[383:320] := a[383:320]
dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
dst[511:448] := a[511:448]
dst[MAX:512] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512SllEpi16 ¶
M512SllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_sll_epi16'. Requires AVX512BW.
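The count handling above saturates rather than wraps: a single 64-bit count applies to every element, and any count over 15 clears the result. A scalar Go sketch (illustrative name, not part of this package):

func sllEpi16(a [32]uint16, count uint64) (dst [32]uint16) {
	for j := range a {
		if count > 15 {
			dst[j] = 0 // the entire element is shifted out
		} else {
			dst[j] = a[j] << count
		}
	}
	return
}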
func M512SlliEpi16 ¶
M512SlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm512_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512SllvEpi16 ¶
M512SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm512_sllv_epi16'. Requires AVX512BW.
func M512SraEpi16 ¶
M512SraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_sra_epi16'. Requires AVX512BW.
func M512SraiEpi16 ¶
M512SraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm512_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
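For the arithmetic shift above, a count over 15 leaves each element filled with copies of its sign bit (0 or -1). A scalar Go sketch (illustrative name, not part of this package):

func sraiEpi16(a [32]int16, imm8 uint8) (dst [32]int16) {
	for j := range a {
		if imm8 > 15 {
			dst[j] = a[j] >> 15 // every bit becomes a copy of the sign bit
		} else {
			dst[j] = a[j] >> imm8 // >> on a signed operand is arithmetic in Go
		}
	}
	return
}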
func M512SravEpi16 ¶
M512SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm512_srav_epi16'. Requires AVX512BW.
func M512SrlEpi16 ¶
M512SrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_srl_epi16'. Requires AVX512BW.
func M512SrliEpi16 ¶
M512SrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm512_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func M512SrlvEpi16 ¶
M512SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm512_srlv_epi16'. Requires AVX512BW.
func M512SubEpi16 ¶
M512SubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := a[i+15:i] - b[i+15:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm512_sub_epi16'. Requires AVX512BW.
func M512SubEpi8 ¶
M512SubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := a[i+7:i] - b[i+7:i] ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm512_sub_epi8'. Requires AVX512BW.
func M512SubsEpi16 ¶
M512SubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm512_subs_epi16'. Requires AVX512BW.
func M512SubsEpi8 ¶
M512SubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm512_subs_epi8'. Requires AVX512BW.
func M512SubsEpu16 ¶
M512SubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst'.
FOR j := 0 to 31 i := j*16 dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_subs_epu16'. Requires AVX512BW.
func M512SubsEpu8 ¶
M512SubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst'.
FOR j := 0 to 63 i := j*8 dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ENDFOR dst[MAX:512] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_subs_epu8'. Requires AVX512BW.
func M512TestEpi16Mask ¶
M512TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 31 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm512_test_epi16_mask'. Requires AVX512BW.
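A scalar Go sketch of the mask construction above (testEpi16Mask is an illustrative name, not part of this package): each mask bit records whether the AND of the corresponding elements is non-zero.

func testEpi16Mask(a, b [32]uint16) (k uint32) {
	for j := range a {
		if a[j]&b[j] != 0 {
			k |= 1 << uint(j)
		}
	}
	return
}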
func M512TestEpi8Mask ¶
M512TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 63 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm512_test_epi8_mask'. Requires AVX512BW.
func M512TestnEpi16Mask ¶
M512TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 31 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:32] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm512_testn_epi16_mask'. Requires AVX512BW.
func M512TestnEpi8Mask ¶
M512TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 63 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:64] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm512_testn_epi8_mask'. Requires AVX512BW.
func M512UnpackhiEpi16 ¶
M512UnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_unpackhi_epi16'. Requires AVX512BW.
func M512UnpackhiEpi8 ¶
M512UnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_unpackhi_epi8'. Requires AVX512BW.
func M512UnpackloEpi16 ¶
M512UnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_unpacklo_epi16'. Requires AVX512BW.
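The INTERLEAVE_WORDS helper above reduces to a simple alternation, shown here as a scalar Go sketch over one 128-bit lane (illustrative name, not part of this package):

func interleaveWords(src1, src2 [8]uint16) (dst [8]uint16) {
	for t := 0; t < 4; t++ {
		dst[2*t] = src1[t]   // even slots from the first source
		dst[2*t+1] = src2[t] // odd slots from the second source
	}
	return
}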
func M512UnpackloEpi8 ¶
M512UnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_unpacklo_epi8'. Requires AVX512BW.
func Mask2Permutex2varEpi16 ¶
Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := idx[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2W'. Intrinsic: '_mm_mask2_permutex2var_epi16'. Requires AVX512BW.
func MaskAbsEpi16 ¶
MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm_mask_abs_epi16'. Requires AVX512BW.
func MaskAbsEpi8 ¶
MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm_mask_abs_epi8'. Requires AVX512BW.
func MaskAddEpi16 ¶
MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm_mask_add_epi16'. Requires AVX512BW.
func MaskAddEpi8 ¶
MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm_mask_add_epi8'. Requires AVX512BW.
func MaskAddsEpi16 ¶
MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm_mask_adds_epi16'. Requires AVX512BW.
func MaskAddsEpi8 ¶
MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm_mask_adds_epi8'. Requires AVX512BW.
func MaskAddsEpu16 ¶
MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm_mask_adds_epu16'. Requires AVX512BW.
func MaskAddsEpu8 ¶
MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm_mask_adds_epu8'. Requires AVX512BW.
func MaskAlignrEpi8 ¶
func MaskAlignrEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i, count int) (dst x86.M128i)
MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm_mask_alignr_epi8'. Requires AVX512BW.
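A scalar Go sketch of the masked byte alignment above (illustrative name, not part of this package): 'b' occupies the low half of the 32-byte concatenation and 'a' the high half, bytes shifted in from beyond the pair are zero, and masked-off elements fall back to 'src'.

func maskAlignrEpi8(src [16]uint8, k uint16, a, b [16]uint8, count int) (dst [16]uint8) {
	var tmp [32]uint8
	copy(tmp[:16], b[:]) // b fills the low 16 bytes
	copy(tmp[16:], a[:]) // a fills the high 16 bytes
	for j := range dst {
		var v uint8
		if p := count + j; p >= 0 && p < 32 {
			v = tmp[p] // bytes beyond the concatenation read as zero
		}
		if k&(1<<uint(j)) != 0 {
			dst[j] = v
		} else {
			dst[j] = src[j] // masked-off elements copy from src
		}
	}
	return
}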
func MaskAvgEpu16 ¶
MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm_mask_avg_epu16'. Requires AVX512BW.
func MaskAvgEpu8 ¶
MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm_mask_avg_epu8'. Requires AVX512BW.
func MaskBlendEpi16 ¶
MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := b[i+15:i] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBLENDMW'. Intrinsic: '_mm_mask_blend_epi16'. Requires AVX512BW.
func MaskBlendEpi8 ¶
MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := b[i+7:i] ELSE dst[i+7:i] := a[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBLENDMB'. Intrinsic: '_mm_mask_blend_epi8'. Requires AVX512BW.
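Both blend variants follow the same rule; a scalar Go sketch of the 8-bit case (illustrative name, not part of this package):

func maskBlendEpi8(k uint16, a, b [16]uint8) (dst [16]uint8) {
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = b[j] // mask bit set: take the element from b
		} else {
			dst[j] = a[j] // mask bit clear: take the element from a
		}
	}
	return
}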
func MaskBroadcastbEpi8 ¶
MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_mask_broadcastb_epi8'. Requires AVX512BW.
func MaskBroadcastwEpi16 ¶
MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_mask_broadcastw_epi16'. Requires AVX512BW.
func MaskCmpEpi16Mask ¶
MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmp_epi16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
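The eight predicate encodings in imm8[2:0] are shared by all of the Cmp*Mask functions in this section. A scalar Go sketch of the decode (illustrative name, not part of this package; the comments mirror the _MM_CMPINT_* constants in the pseudocode above):

func cmpInt16(a, b int16, imm8 uint8) bool {
	switch imm8 & 7 {
	case 0: // _MM_CMPINT_EQ
		return a == b
	case 1: // _MM_CMPINT_LT
		return a < b
	case 2: // _MM_CMPINT_LE
		return a <= b
	case 3: // _MM_CMPINT_FALSE
		return false
	case 4: // _MM_CMPINT_NEQ
		return a != b
	case 5: // _MM_CMPINT_NLT
		return a >= b
	case 6: // _MM_CMPINT_NLE
		return a > b
	default: // _MM_CMPINT_TRUE
		return true
	}
}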
func MaskCmpEpi8Mask ¶
MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmp_epi8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskCmpEpu16Mask ¶
MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmp_epu16_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskCmpEpu8Mask ¶
MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
CASE (imm8[7:0]) OF 0: OP := _MM_CMPINT_EQ 1: OP := _MM_CMPINT_LT 2: OP := _MM_CMPINT_LE 3: OP := _MM_CMPINT_FALSE 4: OP := _MM_CMPINT_NEQ 5: OP := _MM_CMPINT_NLT 6: OP := _MM_CMPINT_NLE 7: OP := _MM_CMPINT_TRUE ESAC FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmp_epu8_mask'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskCmpeqEpi16Mask ¶
MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpeq_epi16_mask'. Requires AVX512BW.
func MaskCmpeqEpi8Mask ¶
MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpeq_epi8_mask'. Requires AVX512BW.
func MaskCmpeqEpu16Mask ¶
MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpeq_epu16_mask'. Requires AVX512BW.
func MaskCmpeqEpu8Mask ¶
MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpeq_epu8_mask'. Requires AVX512BW.
func MaskCmpgeEpi16Mask ¶
MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpge_epi16_mask'. Requires AVX512BW.
func MaskCmpgeEpi8Mask ¶
MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpge_epi8_mask'. Requires AVX512BW.
func MaskCmpgeEpu16Mask ¶
MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpge_epu16_mask'. Requires AVX512BW.
func MaskCmpgeEpu8Mask ¶
MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpge_epu8_mask'. Requires AVX512BW.
func MaskCmpgtEpi16Mask ¶
MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpgt_epi16_mask'. Requires AVX512BW.
func MaskCmpgtEpi8Mask ¶
MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpgt_epi8_mask'. Requires AVX512BW.
func MaskCmpgtEpu16Mask ¶
MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpgt_epu16_mask'. Requires AVX512BW.
func MaskCmpgtEpu8Mask ¶
MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpgt_epu8_mask'. Requires AVX512BW.
func MaskCmpleEpi16Mask ¶
MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmple_epi16_mask'. Requires AVX512BW.
func MaskCmpleEpi8Mask ¶
MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmple_epi8_mask'. Requires AVX512BW.
func MaskCmpleEpu16Mask ¶
MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmple_epu16_mask'. Requires AVX512BW.
func MaskCmpleEpu8Mask ¶
MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmple_epu8_mask'. Requires AVX512BW.
func MaskCmpltEpi16Mask ¶
MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmplt_epi16_mask'. Requires AVX512BW.
func MaskCmpltEpi8Mask ¶
MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmplt_epi8_mask'. Requires AVX512BW.
func MaskCmpltEpu16Mask ¶
MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmplt_epu16_mask'. Requires AVX512BW.
func MaskCmpltEpu8Mask ¶
MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmplt_epu8_mask'. Requires AVX512BW.
func MaskCmpneqEpi16Mask ¶
MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpneq_epi16_mask'. Requires AVX512BW.
func MaskCmpneqEpi8Mask ¶
MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpneq_epi8_mask'. Requires AVX512BW.
func MaskCmpneqEpu16Mask ¶
MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpneq_epu16_mask'. Requires AVX512BW.
func MaskCmpneqEpu8Mask ¶
MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpneq_epu8_mask'. Requires AVX512BW.
func MaskCvtepi16Epi8 ¶
MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm_mask_cvtepi16_epi8'. Requires AVX512BW.
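A scalar Go sketch of the masked truncation above (illustrative name, not part of this package): each 16-bit element is cut down to its low byte, and masked-off destination bytes come from 'src'.

func maskCvtepi16Epi8(src [8]uint8, k uint8, a [8]int16) (dst [8]uint8) {
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = uint8(a[j]) // truncation keeps bits 7:0 only
		} else {
			dst[j] = src[j]
		}
	}
	return
}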
func MaskCvtepi8Epi16 ¶
MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm_mask_cvtepi8_epi16'. Requires AVX512BW.
func MaskCvtepu8Epi16 ¶
MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := src[l+15:l] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm_mask_cvtepu8_epi16'. Requires AVX512BW.
func MaskCvtsepi16Epi8 ¶
MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm_mask_cvtsepi16_epi8'. Requires AVX512BW.
func MaskCvtusepi16Epi8 ¶
MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := src[l+7:l] FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_mask_cvtusepi16_epi8'. Requires AVX512BW.
func MaskDbsadEpu8 ¶
func MaskDbsadEpu8(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The
first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])
FOR j := 0 to 1
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm_mask_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskMaddEpi16 ¶
MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := src[i+31:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm_mask_madd_epi16'. Requires AVX512BW.
func MaskMaddubsEpi16 ¶
MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm_mask_maddubs_epi16'. Requires AVX512BW.
func MaskMaxEpi16 ¶
MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm_mask_max_epi16'. Requires AVX512BW.
func MaskMaxEpi8 ¶
MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm_mask_max_epi8'. Requires AVX512BW.
func MaskMaxEpu16 ¶
MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm_mask_max_epu16'. Requires AVX512BW.
func MaskMaxEpu8 ¶
MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm_mask_max_epu8'. Requires AVX512BW.
func MaskMinEpi16 ¶
MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm_mask_min_epi16'. Requires AVX512BW.
func MaskMinEpi8 ¶
MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm_mask_min_epi8'. Requires AVX512BW.
func MaskMinEpu16 ¶
MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm_mask_min_epu16'. Requires AVX512BW.
func MaskMinEpu8 ¶
MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm_mask_min_epu8'. Requires AVX512BW.
func MaskMovEpi16 ¶
MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm_mask_mov_epi16'. Requires AVX512BW.
func MaskMovEpi8 ¶
MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm_mask_mov_epi8'. Requires AVX512BW.
func MaskMulhiEpi16 ¶
MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm_mask_mulhi_epi16'. Requires AVX512BW.
func MaskMulhiEpu16 ¶
MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm_mask_mulhi_epu16'. Requires AVX512BW.
func MaskMulhrsEpi16 ¶
MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm_mask_mulhrs_epi16'. Requires AVX512BW.
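The rounding step above is easy to misread; in scalar Go it is just a shift, an increment, and another shift (illustrative name, not part of this package):

func mulhrsEpi16(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1 // keep the top 18 bits, then round
	return int16(tmp >> 1)             // bits [16:1] of the rounded intermediate
}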
func MaskMulloEpi16 ¶
MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm_mask_mullo_epi16'. Requires AVX512BW.
func MaskPacksEpi16 ¶
MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm_mask_packs_epi16'. Requires AVX512BW.
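The Saturate_Int16_To_Int8 helper used above clamps to the int8 range rather than truncating; a scalar Go sketch (illustrative name, not part of this package):

func saturateInt16ToInt8(v int16) int8 {
	if v > 127 {
		return 127 // clamp above the int8 maximum
	}
	if v < -128 {
		return -128 // clamp below the int8 minimum
	}
	return int8(v)
}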
func MaskPacksEpi32 ¶
MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm_mask_packs_epi32'. Requires AVX512BW.
func MaskPackusEpi16 ¶
MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm_mask_packus_epi16'. Requires AVX512BW.
func MaskPackusEpi32 ¶
MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm_mask_packus_epi32'. Requires AVX512BW.
func MaskPermutex2varEpi16 ¶
MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := a[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMT2W'. Intrinsic: '_mm_mask_permutex2var_epi16'. Requires AVX512BW.
func MaskPermutexvarEpi16 ¶
MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm_mask_permutexvar_epi16'. Requires AVX512BW.
func MaskSet1Epi16 ¶
MaskSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_mask_set1_epi16'. Requires AVX512BW.
func MaskSet1Epi8 ¶
MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_mask_set1_epi8'. Requires AVX512BW.
func MaskShuffleEpi8 ¶
MaskShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm_mask_shuffle_epi8'. Requires AVX512BW.
func MaskShufflehiEpi16 ¶
MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 'dst', with the low 64 bits being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm_mask_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskShuffleloEpi16 ¶
MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 'dst', with the high 64 bits being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm_mask_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskSllEpi16 ¶
MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm_mask_sll_epi16'. Requires AVX512BW.
func MaskSlliEpi16 ¶
MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm_mask_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskSllvEpi16 ¶
MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm_mask_sllv_epi16'. Requires AVX512BW.
func MaskSraEpi16 ¶
MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm_mask_sra_epi16'. Requires AVX512BW.
func MaskSraiEpi16 ¶
MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm_mask_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
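For the arithmetic right shift, counts above 15 must leave every bit equal to the sign bit ('SignBit' in the pseudocode). A scalar sketch can emulate that by clamping the count to 15 before shifting a signed value (names are illustrative):

    // maskSraiEpi16 performs the masked arithmetic shift; shifting an
    // int16 by 15 replicates the sign bit across the element, which is
    // exactly the required behaviour for oversized counts.
    func maskSraiEpi16(src, a [8]int16, k uint8, imm8 uint8) (dst [8]int16) {
        shift := imm8
        if shift > 15 {
            shift = 15
        }
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = a[j] >> shift
            } else {
                dst[j] = src[j]
            }
        }
        return
    }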
func MaskSravEpi16 ¶
MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm_mask_srav_epi16'. Requires AVX512BW.
func MaskSrlEpi16 ¶
MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm_mask_srl_epi16'. Requires AVX512BW.
func MaskSrliEpi16 ¶
MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm_mask_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskSrlvEpi16 ¶
MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm_mask_srlv_epi16'. Requires AVX512BW.
func MaskSubEpi16 ¶
MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm_mask_sub_epi16'. Requires AVX512BW.
func MaskSubEpi8 ¶
MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm_mask_sub_epi8'. Requires AVX512BW.
func MaskSubsEpi16 ¶
MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm_mask_subs_epi16'. Requires AVX512BW.
func MaskSubsEpi8 ¶
MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm_mask_subs_epi8'. Requires AVX512BW.
func MaskSubsEpu16 ¶
MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm_mask_subs_epu16'. Requires AVX512BW.
func MaskSubsEpu8 ¶
MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm_mask_subs_epu8'. Requires AVX512BW.
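The unsigned saturating subtract clamps at zero instead of wrapping. A hypothetical scalar sketch of the writemasked byte variant:

    // maskSubsEpu8: lanes that would go negative clamp to 0;
    // masked-off lanes come from src.
    func maskSubsEpu8(src, a, b [16]uint8, k uint16) (dst [16]uint8) {
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) == 0 {
                dst[j] = src[j]
                continue
            }
            if a[j] < b[j] {
                dst[j] = 0 // saturate instead of wrapping
            } else {
                dst[j] = a[j] - b[j]
            }
        }
        return
    }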
func MaskTestEpi16Mask ¶
MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm_mask_test_epi16_mask'. Requires AVX512BW.
func MaskTestEpi8Mask ¶
MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm_mask_test_epi8_mask'. Requires AVX512BW.
func MaskTestnEpi16Mask ¶
MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 7 i := j*16 IF k1[j] k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm_mask_testn_epi16_mask'. Requires AVX512BW.
func MaskTestnEpi8Mask ¶
MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.
FOR j := 0 to 15 i := j*8 IF k1[j] k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm_mask_testn_epi8_mask'. Requires AVX512BW.
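The four test/testn variants differ only in the comparison against zero. A scalar sketch of the NAND form for bytes (the test form is identical with '!= 0'); the helper is illustrative, not a package API:

    // maskTestnEpi8Mask sets bit j of the result when a[j] AND b[j] is
    // zero and the incoming writemask k1 allows the lane.
    func maskTestnEpi8Mask(k1 uint16, a, b [16]uint8) (k uint16) {
        for j := 0; j < 16; j++ {
            if k1&(1<<uint(j)) != 0 && a[j]&b[j] == 0 {
                k |= 1 << uint(j)
            }
        }
        return
    }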
func MaskUnpackhiEpi16 ¶
MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm_mask_unpackhi_epi16'. Requires AVX512BW.
func MaskUnpackhiEpi8 ¶
MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm_mask_unpackhi_epi8'. Requires AVX512BW.
func MaskUnpackloEpi16 ¶
MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := src[i+15:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm_mask_unpacklo_epi16'. Requires AVX512BW.
func MaskUnpackloEpi8 ¶
MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := src[i+7:i] FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm_mask_unpacklo_epi8'. Requires AVX512BW.
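The unpack family interleaves one half of each source. A hypothetical scalar sketch of the low-byte interleave with a writemask (the high variants differ only in which half is read):

    // maskUnpackloEpi8 interleaves bytes 0-7 of a and b as
    // a0,b0,a1,b1,..., then applies the writemask per byte.
    func maskUnpackloEpi8(src, a, b [16]uint8, k uint16) (dst [16]uint8) {
        var tmp [16]uint8
        for j := 0; j < 8; j++ {
            tmp[2*j] = a[j]
            tmp[2*j+1] = b[j]
        }
        for j := 0; j < 16; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = tmp[j]
            } else {
                dst[j] = src[j]
            }
        }
        return
    }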
func MaskzAbsEpi16 ¶
MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ABS(a[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSW'. Intrinsic: '_mm_maskz_abs_epi16'. Requires AVX512BW.
func MaskzAbsEpi8 ¶
MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := ABS(a[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPABSB'. Intrinsic: '_mm_maskz_abs_epi8'. Requires AVX512BW.
func MaskzAddEpi16 ¶
MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] + b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDW'. Intrinsic: '_mm_maskz_add_epi16'. Requires AVX512BW.
func MaskzAddEpi8 ¶
MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] + b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDB'. Intrinsic: '_mm_maskz_add_epi8'. Requires AVX512BW.
func MaskzAddsEpi16 ¶
MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDSW'. Intrinsic: '_mm_maskz_adds_epi16'. Requires AVX512BW.
func MaskzAddsEpi8 ¶
MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDSB'. Intrinsic: '_mm_maskz_adds_epi8'. Requires AVX512BW.
func MaskzAddsEpu16 ¶
MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDUSW'. Intrinsic: '_mm_maskz_adds_epu16'. Requires AVX512BW.
func MaskzAddsEpu8 ¶
MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] ) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPADDUSB'. Intrinsic: '_mm_maskz_adds_epu8'. Requires AVX512BW.
func MaskzAlignrEpi8 ¶
MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPALIGNR'. Intrinsic: '_mm_maskz_alignr_epi8'. Requires AVX512BW.
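Viewed at the byte level, the 256-bit concatenation puts 'b' in bytes 0-15 and 'a' in bytes 16-31, so the right shift simply reads 16 bytes starting at offset 'count'. A scalar sketch under that view (helper name invented):

    // maskzAlignrEpi8: bytes shifted in from beyond the concatenation
    // are zero, and lanes with a clear mask bit stay zero (zeromask).
    func maskzAlignrEpi8(a, b [16]uint8, k uint16, count uint8) (dst [16]uint8) {
        var concat [32]uint8
        copy(concat[:16], b[:])
        copy(concat[16:], a[:])
        for j := 0; j < 16; j++ {
            idx := j + int(count)
            var v uint8
            if idx < 32 {
                v = concat[idx]
            }
            if k&(1<<uint(j)) != 0 {
                dst[j] = v
            }
        }
        return
    }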
func MaskzAvgEpu16 ¶
MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPAVGW'. Intrinsic: '_mm_maskz_avg_epu16'. Requires AVX512BW.
func MaskzAvgEpu8 ¶
MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPAVGB'. Intrinsic: '_mm_maskz_avg_epu8'. Requires AVX512BW.
func MaskzBroadcastbEpi8 ¶
MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_maskz_broadcastb_epi8'. Requires AVX512BW.
func MaskzBroadcastwEpi16 ¶
MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_maskz_broadcastw_epi16'. Requires AVX512BW.
func MaskzCvtepi16Epi8 ¶
MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVWB'. Intrinsic: '_mm_maskz_cvtepi16_epi8'. Requires AVX512BW.
func MaskzCvtepi8Epi16 ¶
MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := SignExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVSXBW'. Intrinsic: '_mm_maskz_cvtepi8_epi16'. Requires AVX512BW.
func MaskzCvtepu8Epi16 ¶
MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*8 l := j*16 IF k[j] dst[l+15:l] := ZeroExtend(a[i+7:i]) ELSE dst[l+15:l] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVZXBW'. Intrinsic: '_mm_maskz_cvtepu8_epi16'. Requires AVX512BW.
func MaskzCvtsepi16Epi8 ¶
MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVSWB'. Intrinsic: '_mm_maskz_cvtsepi16_epi8'. Requires AVX512BW.
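A scalar sketch of the signed saturating narrow: each word clamps to [-128, 127] before truncation. The upper 64 bits of the real 128-bit result are always zero, so the illustrative [8]int8 return models only the live low half:

    // maskzCvtsepi16Epi8 narrows with signed saturation; masked-off
    // bytes stay zero (zeromask).
    func maskzCvtsepi16Epi8(a [8]int16, k uint8) (dst [8]int8) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) == 0 {
                continue
            }
            v := a[j]
            if v > 127 {
                v = 127
            } else if v < -128 {
                v = -128
            }
            dst[j] = int8(v)
        }
        return
    }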
func MaskzCvtusepi16Epi8 ¶
MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := 16*j l := 8*j IF k[j] dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i]) ELSE dst[l+7:l] := 0 FI ENDFOR dst[MAX:64] := 0
Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_maskz_cvtusepi16_epi8'. Requires AVX512BW.
func MaskzDbsadEpu8 ¶
MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
tmp[31:0] := select(b[127:0], imm8[1:0]) tmp[63:32] := select(b[127:0], imm8[3:2]) tmp[95:64] := select(b[127:0], imm8[5:4]) tmp[127:96] := select(b[127:0], imm8[7:6]) FOR j := 0 to 1 i := j*64 tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) ENDFOR FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VDBPSADBW'. Intrinsic: '_mm_maskz_dbsad_epu8'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
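The pseudocode above is dense; the following plain-Go sketch spells out the same steps: select four quadruplets of 'b' by 'imm8', then form four 16-bit SADs per 64-bit lane against byte-shifted windows of the selection. The helper is hypothetical and follows the pseudocode, not a package API:

    // maskzDbsadEpu8 computes the double-block SAD with a zeromask on
    // the eight 16-bit result words.
    func maskzDbsadEpu8(a, b [16]uint8, k uint8, imm8 uint8) (dst [8]uint16) {
        absdiff := func(x, y uint8) uint16 {
            if x > y {
                return uint16(x - y)
            }
            return uint16(y - x)
        }
        // Build tmp from the four selected 32-bit quadruplets of b.
        var tmp [16]uint8
        for q := 0; q < 4; q++ {
            sel := int((imm8 >> (2 * uint(q))) & 3)
            copy(tmp[4*q:4*q+4], b[4*sel:4*sel+4])
        }
        // One SAD over four byte pairs starting at the given offsets.
        sad := func(abase, tbase int) (s uint16) {
            for n := 0; n < 4; n++ {
                s += absdiff(a[abase+n], tmp[tbase+n])
            }
            return
        }
        var words [8]uint16
        for lane := 0; lane < 2; lane++ {
            i := lane * 8
            words[lane*4+0] = sad(i, i)     // low quadruplet of a, offset 0
            words[lane*4+1] = sad(i, i+1)   // low quadruplet of a, offset 1
            words[lane*4+2] = sad(i+4, i+2) // high quadruplet of a, offset 2
            words[lane*4+3] = sad(i+4, i+3) // high quadruplet of a, offset 3
        }
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = words[j]
            }
        }
        return
    }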
func MaskzMaddEpi16 ¶
MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 3 i := j*32 IF k[j] dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i] ELSE dst[i+31:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMADDWD'. Intrinsic: '_mm_maskz_madd_epi16'. Requires AVX512BW.
func MaskzMaddubsEpi16 ¶
MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMADDUBSW'. Intrinsic: '_mm_maskz_maddubs_epi16'. Requires AVX512BW.
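A scalar sketch of the unsigned-by-signed multiply-add: products are formed at 16-bit precision, adjacent pairs are summed, and the sum saturates to int16 (names illustrative):

    // maskzMaddubsEpi16: unsigned bytes of a times signed bytes of b,
    // adjacent products summed and saturated; masked-off words stay 0.
    func maskzMaddubsEpi16(a [16]uint8, b [16]int8, k uint8) (dst [8]int16) {
        sat16 := func(v int32) int16 {
            if v > 32767 {
                return 32767
            }
            if v < -32768 {
                return -32768
            }
            return int16(v)
        }
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) == 0 {
                continue // zeromask: lane stays 0
            }
            lo := int32(a[2*j]) * int32(b[2*j])
            hi := int32(a[2*j+1]) * int32(b[2*j+1])
            dst[j] = sat16(lo + hi)
        }
        return
    }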
func MaskzMaxEpi16 ¶
MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSW'. Intrinsic: '_mm_maskz_max_epi16'. Requires AVX512BW.
func MaskzMaxEpi8 ¶
MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXSB'. Intrinsic: '_mm_maskz_max_epi8'. Requires AVX512BW.
func MaskzMaxEpu16 ¶
MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] > b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUW'. Intrinsic: '_mm_maskz_max_epu16'. Requires AVX512BW.
func MaskzMaxEpu8 ¶
MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] > b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMAXUB'. Intrinsic: '_mm_maskz_max_epu8'. Requires AVX512BW.
func MaskzMinEpi16 ¶
MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSW'. Intrinsic: '_mm_maskz_min_epi16'. Requires AVX512BW.
func MaskzMinEpi8 ¶
MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINSB'. Intrinsic: '_mm_maskz_min_epi8'. Requires AVX512BW.
func MaskzMinEpu16 ¶
MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF a[i+15:i] < b[i+15:i] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := b[i+15:i] FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUW'. Intrinsic: '_mm_maskz_min_epu16'. Requires AVX512BW.
func MaskzMinEpu8 ¶
MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF a[i+7:i] < b[i+7:i] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := b[i+7:i] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMINUB'. Intrinsic: '_mm_maskz_min_epu8'. Requires AVX512BW.
func MaskzMovEpi16 ¶
MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQU16'. Intrinsic: '_mm_maskz_mov_epi16'. Requires AVX512BW.
func MaskzMovEpi8 ¶
MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VMOVDQU8'. Intrinsic: '_mm_maskz_mov_epi8'. Requires AVX512BW.
func MaskzMulhiEpi16 ¶
MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHW'. Intrinsic: '_mm_maskz_mulhi_epi16'. Requires AVX512BW.
func MaskzMulhiEpu16 ¶
MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[31:16] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHUW'. Intrinsic: '_mm_maskz_mulhi_epu16'. Requires AVX512BW.
func MaskzMulhrsEpi16 ¶
MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1 dst[i+15:i] := tmp[16:1] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULHRSW'. Intrinsic: '_mm_maskz_mulhrs_epi16'. Requires AVX512BW.
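The mulhrs operation is a fixed-point multiply with round-to-nearest: take the full 32-bit product, shift right by 14, add 1, and keep bits [16:1]. A scalar sketch (helper name invented):

    // maskzMulhrsEpi16 keeps bits [16:1] of ((a*b)>>14)+1, which the
    // final >>1 with truncation to int16 reproduces exactly.
    func maskzMulhrsEpi16(a, b [8]int16, k uint8) (dst [8]int16) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) == 0 {
                continue // zeromask: lane stays 0
            }
            tmp := (int32(a[j])*int32(b[j]))>>14 + 1
            dst[j] = int16(tmp >> 1)
        }
        return
    }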
func MaskzMulloEpi16 ¶
MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] tmp[31:0] := a[i+15:i] * b[i+15:i] dst[i+15:i] := tmp[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMULLW'. Intrinsic: '_mm_maskz_mullo_epi16'. Requires AVX512BW.
func MaskzPacksEpi16 ¶
MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPACKSSWB'. Intrinsic: '_mm_maskz_packs_epi16'. Requires AVX512BW.
func MaskzPacksEpi32 ¶
MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPACKSSDW'. Intrinsic: '_mm_maskz_packs_epi32'. Requires AVX512BW.
func MaskzPackusEpi16 ¶
MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0]) tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16]) tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32]) tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48]) tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64]) tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80]) tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96]) tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112]) tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0]) tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16]) tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32]) tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48]) tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64]) tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80]) tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96]) tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPACKUSWB'. Intrinsic: '_mm_maskz_packus_epi16'. Requires AVX512BW.
func MaskzPackusEpi32 ¶
MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0]) tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32]) tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64]) tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96]) tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0]) tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32]) tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64]) tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPACKUSDW'. Intrinsic: '_mm_maskz_packus_epi32'. Requires AVX512BW.
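A scalar sketch of the unsigned pack: the dwords of 'a' fill words 0-3 and those of 'b' fill words 4-7, each clamped to [0, 65535] (names illustrative):

    // maskzPackusEpi32 packs with unsigned saturation and a zeromask.
    func maskzPackusEpi32(a, b [4]int32, k uint8) (dst [8]uint16) {
        satU16 := func(v int32) uint16 {
            if v < 0 {
                return 0
            }
            if v > 65535 {
                return 65535
            }
            return uint16(v)
        }
        var tmp [8]uint16
        for j := 0; j < 4; j++ {
            tmp[j] = satU16(a[j])
            tmp[4+j] = satU16(b[j])
        }
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) != 0 {
                dst[j] = tmp[j]
            }
        }
        return
    }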
func MaskzPermutex2varEpi16 ¶
MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm_maskz_permutex2var_epi16'. Requires AVX512BW.
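Each 16-bit index word carries a 3-bit element selector in its low bits and, in bit 3, a table selector between 'a' and 'b'. A hypothetical scalar sketch:

    // maskzPermutex2varEpi16: bits [2:0] of idx[j] pick an element,
    // bit 3 picks the source table; masked-off lanes stay 0.
    func maskzPermutex2varEpi16(a, idx, b [8]uint16, k uint8) (dst [8]uint16) {
        for j := 0; j < 8; j++ {
            if k&(1<<uint(j)) == 0 {
                continue // zeromask
            }
            off := idx[j] & 7
            if idx[j]&8 != 0 {
                dst[j] = b[off]
            } else {
                dst[j] = a[off]
            }
        }
        return
    }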
func MaskzPermutexvarEpi16 ¶
MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 IF k[j] dst[i+15:i] := a[id+15:id] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm_maskz_permutexvar_epi16'. Requires AVX512BW.
func MaskzSet1Epi16 ¶
MaskzSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[15:0] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_maskz_set1_epi16'. Requires AVX512BW.
func MaskzSet1Epi8 ¶
MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[7:0] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_maskz_set1_epi8'. Requires AVX512BW.
func MaskzShuffleEpi8 ¶
MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] IF b[i+7] == 1 dst[i+7:i] := 0 ELSE index[3:0] := b[i+3:i] dst[i+7:i] := a[index*8+7:index*8] FI ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSHUFB'. Intrinsic: '_mm_maskz_shuffle_epi8'. Requires AVX512BW.
func MaskzShufflehiEpi16 ¶
MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 'dst', with the low 64 bits being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[63:0] := a[63:0] tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSHUFHW'. Intrinsic: '_mm_maskz_shufflehi_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskzShuffleloEpi16 ¶
MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 'dst', with the high 64 bits being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] tmp_dst[127:64] := a[127:64] FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSHUFLW'. Intrinsic: '_mm_maskz_shufflelo_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskzSllEpi16 ¶
MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm_maskz_sll_epi16'. Requires AVX512BW.
func MaskzSlliEpi16 ¶
MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLW'. Intrinsic: '_mm_maskz_slli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskzSllvEpi16 ¶
MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm_maskz_sllv_epi16'. Requires AVX512BW.
func MaskzSraEpi16 ¶
MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm_maskz_sra_epi16'. Requires AVX512BW.
func MaskzSraiEpi16 ¶
MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := SignBit ELSE dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAW'. Intrinsic: '_mm_maskz_srai_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskzSravEpi16 ¶
MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm_maskz_srav_epi16'. Requires AVX512BW.
func MaskzSrlEpi16 ¶
MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF count[63:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm_maskz_srl_epi16'. Requires AVX512BW.
func MaskzSrliEpi16 ¶
MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] IF imm8[7:0] > 15 dst[i+15:i] := 0 ELSE dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0]) FI ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLW'. Intrinsic: '_mm_maskz_srli_epi16'. Requires AVX512BW.
FIXME: Requires compiler support (has immediate)
func MaskzSrlvEpi16 ¶
MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm_maskz_srlv_epi16'. Requires AVX512BW.
func MaskzSubEpi16 ¶
MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := a[i+15:i] - b[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBW'. Intrinsic: '_mm_maskz_sub_epi16'. Requires AVX512BW.
func MaskzSubEpi8 ¶
MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := a[i+7:i] - b[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBB'. Intrinsic: '_mm_maskz_sub_epi8'. Requires AVX512BW.
func MaskzSubsEpi16 ¶
MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBSW'. Intrinsic: '_mm_maskz_subs_epi16'. Requires AVX512BW.
func MaskzSubsEpi8 ¶
MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBSB'. Intrinsic: '_mm_maskz_subs_epi8'. Requires AVX512BW.
func MaskzSubsEpu16 ¶
MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i]) ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBUSW'. Intrinsic: '_mm_maskz_subs_epu16'. Requires AVX512BW.
func MaskzSubsEpu8 ¶
MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i]) ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPSUBUSB'. Intrinsic: '_mm_maskz_subs_epu8'. Requires AVX512BW.
func MaskzUnpackhiEpi16 ¶
MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[79:64] dst[31:16] := src2[79:64] dst[47:32] := src1[95:80] dst[63:48] := src2[95:80] dst[79:64] := src1[111:96] dst[95:80] := src2[111:96] dst[111:96] := src1[127:112] dst[127:112] := src2[127:112] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm_maskz_unpackhi_epi16'. Requires AVX512BW.
func MaskzUnpackhiEpi8 ¶
MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[71:64] dst[15:8] := src2[71:64] dst[23:16] := src1[79:72] dst[31:24] := src2[79:72] dst[39:32] := src1[87:80] dst[47:40] := src2[87:80] dst[55:48] := src1[95:88] dst[63:56] := src2[95:88] dst[71:64] := src1[103:96] dst[79:72] := src2[103:96] dst[87:80] := src1[111:104] dst[95:88] := src2[111:104] dst[103:96] := src1[119:112] dst[111:104] := src2[119:112] dst[119:112] := src1[127:120] dst[127:120] := src2[127:120] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm_maskz_unpackhi_epi8'. Requires AVX512BW.
func MaskzUnpackloEpi16 ¶
MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){ dst[15:0] := src1[15:0] dst[31:16] := src2[15:0] dst[47:32] := src1[31:16] dst[63:48] := src2[31:16] dst[79:64] := src1[47:32] dst[95:80] := src2[47:32] dst[111:96] := src1[63:48] dst[127:112] := src2[63:48] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := tmp_dst[i+15:i] ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm_maskz_unpacklo_epi16'. Requires AVX512BW.
func MaskzUnpackloEpi8 ¶
MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){ dst[7:0] := src1[7:0] dst[15:8] := src2[7:0] dst[23:16] := src1[15:8] dst[31:24] := src2[15:8] dst[39:32] := src1[23:16] dst[47:40] := src2[23:16] dst[55:48] := src1[31:24] dst[63:56] := src2[31:24] dst[71:64] := src1[39:32] dst[79:72] := src2[39:32] dst[87:80] := src1[47:40] dst[95:88] := src2[47:40] dst[103:96] := src1[55:48] dst[111:104] := src2[55:48] dst[119:112] := src1[63:56] dst[127:120] := src2[63:56] RETURN dst[127:0] } tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) FOR j := 0 to 15 i := j*8 IF k[j] dst[i+7:i] := tmp_dst[i+7:i] ELSE dst[i+7:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm_maskz_unpacklo_epi8'. Requires AVX512BW.
func Movepi16Mask ¶
Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.
FOR j := 0 to 7 i := j*16 IF a[i+15] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:8] := 0
Instruction: 'VPMOVW2M'. Intrinsic: '_mm_movepi16_mask'. Requires AVX512BW.
func Movepi8Mask ¶
Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.
FOR j := 0 to 15 i := j*8 IF a[i+7] k[j] := 1 ELSE k[j] := 0 FI ENDFOR k[MAX:16] := 0
Instruction: 'VPMOVB2M'. Intrinsic: '_mm_movepi8_mask'. Requires AVX512BW.
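Both move-mask variants simply gather sign bits. A scalar sketch of the byte form (name illustrative):

    // movepi8Mask sets bit j of the mask to the sign bit of byte j.
    func movepi8Mask(a [16]int8) (k uint16) {
        for j := 0; j < 16; j++ {
            if a[j] < 0 {
                k |= 1 << uint(j)
            }
        }
        return
    }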
func MovmEpi16 ¶
MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.
FOR j := 0 to 7 i := j*16 IF k[j] dst[i+15:i] := 0xFFFF ELSE dst[i+15:i] := 0 FI ENDFOR dst[MAX:128] := 0
Instruction: 'VPMOVM2W'. Intrinsic: '_mm_movm_epi16'. Requires AVX512BW.
func Permutex2varEpi16 ¶
Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 off := 16*idx[i+2:i] dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm_permutex2var_epi16'. Requires AVX512BW.
func PermutexvarEpi16 ¶
PermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 id := idx[i+2:i]*16 dst[i+15:i] := a[id+15:id] ENDFOR dst[MAX:128] := 0
Instruction: 'VPERMW'. Intrinsic: '_mm_permutexvar_epi16'. Requires AVX512BW.
func SllvEpi16 ¶
SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPSLLVW'. Intrinsic: '_mm_sllv_epi16'. Requires AVX512BW.
func SravEpi16 ¶
SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRAVW'. Intrinsic: '_mm_srav_epi16'. Requires AVX512BW.
func SrlvEpi16 ¶
SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.
FOR j := 0 to 7 i := j*16 dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i]) ENDFOR dst[MAX:128] := 0
Instruction: 'VPSRLVW'. Intrinsic: '_mm_srlv_epi16'. Requires AVX512BW.
func TestEpi16Mask ¶
TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 7 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTMW'. Intrinsic: '_mm_test_epi16_mask'. Requires AVX512BW.
func TestEpi8Mask ¶
TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.
FOR j := 0 to 15 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTMB'. Intrinsic: '_mm_test_epi8_mask'. Requires AVX512BW.
func TestnEpi16Mask ¶
TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 7 i := j*16 k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 ENDFOR k[MAX:8] := 0
Instruction: 'VPTESTNMW'. Intrinsic: '_mm_testn_epi16_mask'. Requires AVX512BW.
func TestnEpi8Mask ¶
TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.
FOR j := 0 to 15 i := j*8 k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 ENDFOR k[MAX:16] := 0
Instruction: 'VPTESTNMB'. Intrinsic: '_mm_testn_epi8_mask'. Requires AVX512BW.
Types ¶
This section is empty.