avx512dq

package
v0.0.0-...-3878f85 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func BroadcastI32x2

func BroadcastI32x2(a x86.M128i) (dst x86.M128i)

BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_broadcast_i32x2'. Requires AVX512DQ.

func Cvtepi64Pd

func Cvtepi64Pd(a x86.M128i) (dst x86.M128d)

Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_cvtepi64_pd'. Requires AVX512DQ.

func Cvtepi64Ps

func Cvtepi64Ps(a x86.M128i) (dst x86.M128)

Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_cvtepi64_ps'. Requires AVX512DQ.

func Cvtepu64Pd

func Cvtepu64Pd(a x86.M128i) (dst x86.M128d)

Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_cvtepu64_pd'. Requires AVX512DQ.

func Cvtepu64Ps

func Cvtepu64Ps(a x86.M128i) (dst x86.M128)

Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_cvtepu64_ps'. Requires AVX512DQ.

func CvtpdEpi64

func CvtpdEpi64(a x86.M128d) (dst x86.M128i)

CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_cvtpd_epi64'. Requires AVX512DQ.

func CvtpdEpu64

func CvtpdEpu64(a x86.M128d) (dst x86.M128i)

CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_cvtpd_epu64'. Requires AVX512DQ.

func CvtpsEpi64

func CvtpsEpi64(a x86.M128) (dst x86.M128i)

CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_cvtps_epi64'. Requires AVX512DQ.

func CvtpsEpu64

func CvtpsEpu64(a x86.M128) (dst x86.M128i)

CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_cvtps_epu64'. Requires AVX512DQ.

func CvttpdEpi64

func CvttpdEpi64(a x86.M128d) (dst x86.M128i)

CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_cvttpd_epi64'. Requires AVX512DQ.

func CvttpdEpu64

func CvttpdEpu64(a x86.M128d) (dst x86.M128i)

CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_cvttpd_epu64'. Requires AVX512DQ.

func CvttpsEpi64

func CvttpsEpi64(a x86.M128) (dst x86.M128i)

CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_cvttps_epi64'. Requires AVX512DQ.

func CvttpsEpu64

func CvttpsEpu64(a x86.M128) (dst x86.M128i)

CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_cvttps_epu64'. Requires AVX512DQ.

func FpclassPdMask

func FpclassPdMask(a x86.M128d, imm8 byte) (dst x86.Mmask8)

FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 1
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:2] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassPsMask

func FpclassPsMask(a x86.M128, imm8 byte) (dst x86.Mmask8)

FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassSdMask

func FpclassSdMask(a x86.M128d, imm8 byte) (dst x86.Mmask8)

FpclassSdMask: Test the lower double-precision (64-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
		k[MAX:1] := 0

Instruction: 'VFPCLASSSD'. Intrinsic: '_mm_fpclass_sd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassSsMask

func FpclassSsMask(a x86.M128, imm8 byte) (dst x86.Mmask8)

FpclassSsMask: Test the lower single-precision (32-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
		k[MAX:1] := 0

Instruction: 'VFPCLASSSS'. Intrinsic: '_mm_fpclass_ss_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256BroadcastF32x2

func M256BroadcastF32x2(a x86.M128) (dst x86.M256)

M256BroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_broadcast_f32x2'. Requires AVX512DQ.

func M256BroadcastF64x2

func M256BroadcastF64x2(a x86.M128d) (dst x86.M256d)

M256BroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_broadcast_f64x2'. Requires AVX512DQ.

func M256BroadcastI32x2

func M256BroadcastI32x2(a x86.M128i) (dst x86.M256i)

M256BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_broadcast_i32x2'. Requires AVX512DQ.

func M256BroadcastI64x2

func M256BroadcastI64x2(a x86.M128i) (dst x86.M256i)

M256BroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_broadcast_i64x2'. Requires AVX512DQ.

func M256Cvtepi64Pd

func M256Cvtepi64Pd(a x86.M256i) (dst x86.M256d)

M256Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_cvtepi64_pd'. Requires AVX512DQ.

func M256Cvtepi64Ps

func M256Cvtepi64Ps(a x86.M256i) (dst x86.M128)

M256Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_cvtepi64_ps'. Requires AVX512DQ.

func M256Cvtepu64Pd

func M256Cvtepu64Pd(a x86.M256i) (dst x86.M256d)

M256Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_cvtepu64_pd'. Requires AVX512DQ.

func M256Cvtepu64Ps

func M256Cvtepu64Ps(a x86.M256i) (dst x86.M128)

M256Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_cvtepu64_ps'. Requires AVX512DQ.

func M256CvtpdEpi64

func M256CvtpdEpi64(a x86.M256d) (dst x86.M256i)

M256CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_cvtpd_epi64'. Requires AVX512DQ.

func M256CvtpdEpu64

func M256CvtpdEpu64(a x86.M256d) (dst x86.M256i)

M256CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_cvtpd_epu64'. Requires AVX512DQ.

func M256CvtpsEpi64

func M256CvtpsEpi64(a x86.M128) (dst x86.M256i)

M256CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_cvtps_epi64'. Requires AVX512DQ.

func M256CvtpsEpu64

func M256CvtpsEpu64(a x86.M128) (dst x86.M256i)

M256CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_cvtps_epu64'. Requires AVX512DQ.

func M256CvttpdEpi64

func M256CvttpdEpi64(a x86.M256d) (dst x86.M256i)

M256CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_cvttpd_epi64'. Requires AVX512DQ.

func M256CvttpdEpu64

func M256CvttpdEpu64(a x86.M256d) (dst x86.M256i)

M256CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_cvttpd_epu64'. Requires AVX512DQ.

func M256CvttpsEpi64

func M256CvttpsEpi64(a x86.M128) (dst x86.M256i)

M256CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_cvttps_epi64'. Requires AVX512DQ.

func M256CvttpsEpu64

func M256CvttpsEpu64(a x86.M128) (dst x86.M256i)

M256CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_cvttps_epu64'. Requires AVX512DQ.

func M256Extractf64x2Pd

func M256Extractf64x2Pd(a x86.M256d, imm8 byte) (dst x86.M128d)

M256Extractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Extracti64x2Epi64

func M256Extracti64x2Epi64(a x86.M256i, imm8 byte) (dst x86.M128i)

M256Extracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256FpclassPdMask

func M256FpclassPdMask(a x86.M256d, imm8 byte) (dst x86.Mmask8)

M256FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm256_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256FpclassPsMask

func M256FpclassPsMask(a x86.M256, imm8 byte) (dst x86.Mmask8)

M256FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm256_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Insertf64x2

func M256Insertf64x2(a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256Insertf64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Inserti64x2

func M256Inserti64x2(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256Inserti64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskAndPd

func M256MaskAndPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm256_mask_and_pd'. Requires AVX512DQ.

func M256MaskAndPs

func M256MaskAndPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm256_mask_and_ps'. Requires AVX512DQ.

func M256MaskAndnotPd

func M256MaskAndnotPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm256_mask_andnot_pd'. Requires AVX512DQ.

func M256MaskAndnotPs

func M256MaskAndnotPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm256_mask_andnot_ps'. Requires AVX512DQ.

func M256MaskBroadcastF32x2

func M256MaskBroadcastF32x2(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_mask_broadcast_f32x2'. Requires AVX512DQ.

func M256MaskBroadcastF64x2

func M256MaskBroadcastF64x2(src x86.M256d, k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_mask_broadcast_f64x2'. Requires AVX512DQ.

func M256MaskBroadcastI32x2

func M256MaskBroadcastI32x2(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_mask_broadcast_i32x2'. Requires AVX512DQ.

func M256MaskBroadcastI64x2

func M256MaskBroadcastI64x2(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_mask_broadcast_i64x2'. Requires AVX512DQ.

func M256MaskCvtepi64Pd

func M256MaskCvtepi64Pd(src x86.M256d, k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_mask_cvtepi64_pd'. Requires AVX512DQ.

func M256MaskCvtepi64Ps

func M256MaskCvtepi64Ps(src x86.M128, k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_mask_cvtepi64_ps'. Requires AVX512DQ.

func M256MaskCvtepu64Pd

func M256MaskCvtepu64Pd(src x86.M256d, k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_mask_cvtepu64_pd'. Requires AVX512DQ.

func M256MaskCvtepu64Ps

func M256MaskCvtepu64Ps(src x86.M128, k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_mask_cvtepu64_ps'. Requires AVX512DQ.

func M256MaskCvtpdEpi64

func M256MaskCvtpdEpi64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_mask_cvtpd_epi64'. Requires AVX512DQ.

func M256MaskCvtpdEpu64

func M256MaskCvtpdEpu64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_mask_cvtpd_epu64'. Requires AVX512DQ.

func M256MaskCvtpsEpi64

func M256MaskCvtpsEpi64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_mask_cvtps_epi64'. Requires AVX512DQ.

func M256MaskCvtpsEpu64

func M256MaskCvtpsEpu64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_mask_cvtps_epu64'. Requires AVX512DQ.

func M256MaskCvttpdEpi64

func M256MaskCvttpdEpi64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_mask_cvttpd_epi64'. Requires AVX512DQ.

func M256MaskCvttpdEpu64

func M256MaskCvttpdEpu64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_mask_cvttpd_epu64'. Requires AVX512DQ.

func M256MaskCvttpsEpi64

func M256MaskCvttpsEpi64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_mask_cvttps_epi64'. Requires AVX512DQ.

func M256MaskCvttpsEpu64

func M256MaskCvttpsEpu64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_mask_cvttps_epu64'. Requires AVX512DQ.

func M256MaskExtractf64x2Pd

func M256MaskExtractf64x2Pd(src x86.M128d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M128d)

M256MaskExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_mask_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskExtracti64x2Epi64

func M256MaskExtracti64x2Epi64(src x86.M128i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_mask_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskFpclassPdMask

func M256MaskFpclassPdMask(k1 x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.Mmask8)

M256MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm256_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskFpclassPsMask

func M256MaskFpclassPsMask(k1 x86.Mmask8, a x86.M256, imm8 byte) (dst x86.Mmask8)

M256MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm256_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskInsertf64x2

func M256MaskInsertf64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256MaskInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_mask_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskInserti64x2

func M256MaskInserti64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_mask_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskMulloEpi64

func M256MaskMulloEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_mask_mullo_epi64'. Requires AVX512DQ.

func M256MaskOrPd

func M256MaskOrPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPD'. Intrinsic: '_mm256_mask_or_pd'. Requires AVX512DQ.

func M256MaskOrPs

func M256MaskOrPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPS'. Intrinsic: '_mm256_mask_or_ps'. Requires AVX512DQ.

func M256MaskRangePd

func M256MaskRangePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskRangePs

func M256MaskRangePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskReducePd

func M256MaskReducePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskReducePs

func M256MaskReducePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskXorPd

func M256MaskXorPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm256_mask_xor_pd'. Requires AVX512DQ.

func M256MaskXorPs

func M256MaskXorPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm256_mask_xor_ps'. Requires AVX512DQ.

func M256MaskzAndPd

func M256MaskzAndPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm256_maskz_and_pd'. Requires AVX512DQ.

func M256MaskzAndPs

func M256MaskzAndPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm256_maskz_and_ps'. Requires AVX512DQ.

func M256MaskzAndnotPd

func M256MaskzAndnotPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm256_maskz_andnot_pd'. Requires AVX512DQ.

func M256MaskzAndnotPs

func M256MaskzAndnotPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm256_maskz_andnot_ps'. Requires AVX512DQ.

func M256MaskzBroadcastF32x2

func M256MaskzBroadcastF32x2(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_maskz_broadcast_f32x2'. Requires AVX512DQ.

func M256MaskzBroadcastF64x2

func M256MaskzBroadcastF64x2(k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskzBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_maskz_broadcast_f64x2'. Requires AVX512DQ.

func M256MaskzBroadcastI32x2

func M256MaskzBroadcastI32x2(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_maskz_broadcast_i32x2'. Requires AVX512DQ.

func M256MaskzBroadcastI64x2

func M256MaskzBroadcastI64x2(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_maskz_broadcast_i64x2'. Requires AVX512DQ.

func M256MaskzCvtepi64Pd

func M256MaskzCvtepi64Pd(k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_maskz_cvtepi64_pd'. Requires AVX512DQ.

func M256MaskzCvtepi64Ps

func M256MaskzCvtepi64Ps(k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_maskz_cvtepi64_ps'. Requires AVX512DQ.

func M256MaskzCvtepu64Pd

func M256MaskzCvtepu64Pd(k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_maskz_cvtepu64_pd'. Requires AVX512DQ.

func M256MaskzCvtepu64Ps

func M256MaskzCvtepu64Ps(k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_maskz_cvtepu64_ps'. Requires AVX512DQ.

func M256MaskzCvtpdEpi64

func M256MaskzCvtpdEpi64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_maskz_cvtpd_epi64'. Requires AVX512DQ.

func M256MaskzCvtpdEpu64

func M256MaskzCvtpdEpu64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_maskz_cvtpd_epu64'. Requires AVX512DQ.

func M256MaskzCvtpsEpi64

func M256MaskzCvtpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_maskz_cvtps_epi64'. Requires AVX512DQ.

func M256MaskzCvtpsEpu64

func M256MaskzCvtpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_maskz_cvtps_epu64'. Requires AVX512DQ.

func M256MaskzCvttpdEpi64

func M256MaskzCvttpdEpi64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_maskz_cvttpd_epi64'. Requires AVX512DQ.

func M256MaskzCvttpdEpu64

func M256MaskzCvttpdEpu64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_maskz_cvttpd_epu64'. Requires AVX512DQ.

func M256MaskzCvttpsEpi64

func M256MaskzCvttpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_maskz_cvttps_epi64'. Requires AVX512DQ.

func M256MaskzCvttpsEpu64

func M256MaskzCvttpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_maskz_cvttps_epu64'. Requires AVX512DQ.

func M256MaskzExtractf64x2Pd

func M256MaskzExtractf64x2Pd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M128d)

M256MaskzExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_maskz_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzExtracti64x2Epi64

func M256MaskzExtracti64x2Epi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskzExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_maskz_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzInsertf64x2

func M256MaskzInsertf64x2(k x86.Mmask8, a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256MaskzInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_maskz_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzInserti64x2

func M256MaskzInserti64x2(k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskzInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_maskz_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzMulloEpi64

func M256MaskzMulloEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_maskz_mullo_epi64'. Requires AVX512DQ.

func M256MaskzOrPd

func M256MaskzOrPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPD'. Intrinsic: '_mm256_maskz_or_pd'. Requires AVX512DQ.

func M256MaskzOrPs

func M256MaskzOrPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPS'. Intrinsic: '_mm256_maskz_or_ps'. Requires AVX512DQ.

func M256MaskzRangePd

func M256MaskzRangePd(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzRangePs

func M256MaskzRangePs(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzReducePd

func M256MaskzReducePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzReducePs

func M256MaskzReducePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzXorPd

func M256MaskzXorPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm256_maskz_xor_pd'. Requires AVX512DQ.

func M256MaskzXorPs

func M256MaskzXorPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm256_maskz_xor_ps'. Requires AVX512DQ.

func M256Movepi32Mask

func M256Movepi32Mask(a x86.M256i) (dst x86.Mmask8)

M256Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm256_movepi32_mask'. Requires AVX512DQ.

func M256Movepi64Mask

func M256Movepi64Mask(a x86.M256i) (dst x86.Mmask8)

M256Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm256_movepi64_mask'. Requires AVX512DQ.

func M256MovmEpi32

func M256MovmEpi32(k x86.Mmask8) (dst x86.M256i)

M256MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm256_movm_epi32'. Requires AVX512DQ.

func M256MovmEpi64

func M256MovmEpi64(k x86.Mmask8) (dst x86.M256i)

M256MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm256_movm_epi64'. Requires AVX512DQ.

func M256MulloEpi64

func M256MulloEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_mullo_epi64'. Requires AVX512DQ.

func M256RangePd

func M256RangePd(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256RangePs

func M256RangePs(a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256ReducePd

func M256ReducePd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256ReducePs

func M256ReducePs(a x86.M256, imm8 byte) (dst x86.M256)

M256ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512AndPd

func M512AndPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512AndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_and_pd'. Requires AVX512DQ.

func M512AndPs

func M512AndPs(a x86.M512, b x86.M512) (dst x86.M512)

M512AndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_and_ps'. Requires AVX512DQ.

func M512AndnotPd

func M512AndnotPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512AndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_andnot_pd'. Requires AVX512DQ.

func M512AndnotPs

func M512AndnotPs(a x86.M512, b x86.M512) (dst x86.M512)

M512AndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_andnot_ps'. Requires AVX512DQ.

func M512BroadcastF32x2

func M512BroadcastF32x2(a x86.M128) (dst x86.M512)

M512BroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_broadcast_f32x2'. Requires AVX512DQ.

func M512BroadcastF32x8

func M512BroadcastF32x8(a x86.M256) (dst x86.M512)

M512BroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_broadcast_f32x8'. Requires AVX512DQ.

func M512BroadcastF64x2

func M512BroadcastF64x2(a x86.M128d) (dst x86.M512d)

M512BroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_broadcast_f64x2'. Requires AVX512DQ.

func M512BroadcastI32x2

func M512BroadcastI32x2(a x86.M128i) (dst x86.M512i)

M512BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_broadcast_i32x2'. Requires AVX512DQ.

func M512BroadcastI32x8

func M512BroadcastI32x8(a x86.M256i) (dst x86.M512i)

M512BroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_broadcast_i32x8'. Requires AVX512DQ.

func M512BroadcastI64x2

func M512BroadcastI64x2(a x86.M128i) (dst x86.M512i)

M512BroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_broadcast_i64x2'. Requires AVX512DQ.

func M512CvtRoundepi64Pd

func M512CvtRoundepi64Pd(a x86.M512i, rounding int) (dst x86.M512d)

M512CvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512CvtRoundepi64Ps

func M512CvtRoundepi64Ps(a x86.M512i, rounding int) (dst x86.M256)

M512CvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512CvtRoundepu64Pd

func M512CvtRoundepu64Pd(a x86.M512i, rounding int) (dst x86.M512d)

M512CvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512CvtRoundepu64Ps

func M512CvtRoundepu64Ps(a x86.M512i, rounding int) (dst x86.M256)

M512CvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512CvtRoundpdEpi64

func M512CvtRoundpdEpi64(a x86.M512d, rounding int) (dst x86.M512i)

M512CvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512CvtRoundpdEpu64

func M512CvtRoundpdEpu64(a x86.M512d, rounding int) (dst x86.M512i)

M512CvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512CvtRoundpsEpi64

func M512CvtRoundpsEpi64(a x86.M256, rounding int) (dst x86.M512i)

M512CvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_cvt_roundps_epi64'. Requires AVX512DQ.

func M512CvtRoundpsEpu64

func M512CvtRoundpsEpu64(a x86.M256, rounding int) (dst x86.M512i)

M512CvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_cvt_roundps_epu64'. Requires AVX512DQ.

func M512Cvtepi64Pd

func M512Cvtepi64Pd(a x86.M512i) (dst x86.M512d)

M512Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_cvtepi64_pd'. Requires AVX512DQ.

func M512Cvtepi64Ps

func M512Cvtepi64Ps(a x86.M512i) (dst x86.M256)

M512Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_cvtepi64_ps'. Requires AVX512DQ.

func M512Cvtepu64Pd

func M512Cvtepu64Pd(a x86.M512i) (dst x86.M512d)

M512Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_cvtepu64_pd'. Requires AVX512DQ.

func M512Cvtepu64Ps

func M512Cvtepu64Ps(a x86.M512i) (dst x86.M256)

M512Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_cvtepu64_ps'. Requires AVX512DQ.

func M512CvtpdEpi64

func M512CvtpdEpi64(a x86.M512d) (dst x86.M512i)

M512CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_cvtpd_epi64'. Requires AVX512DQ.

func M512CvtpdEpu64

func M512CvtpdEpu64(a x86.M512d) (dst x86.M512i)

M512CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_cvtpd_epu64'. Requires AVX512DQ.

func M512CvtpsEpi64

func M512CvtpsEpi64(a x86.M256) (dst x86.M512i)

M512CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_cvtps_epi64'. Requires AVX512DQ.

func M512CvtpsEpu64

func M512CvtpsEpu64(a x86.M256) (dst x86.M512i)

M512CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_cvtps_epu64'. Requires AVX512DQ.

func M512CvttRoundpdEpi64

func M512CvttRoundpdEpi64(a x86.M512d, sae int) (dst x86.M512i)

M512CvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512CvttRoundpdEpu64

func M512CvttRoundpdEpu64(a x86.M512d, sae int) (dst x86.M512i)

M512CvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512CvttRoundpsEpi64

func M512CvttRoundpsEpi64(a x86.M256, sae int) (dst x86.M512i)

M512CvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512CvttRoundpsEpu64

func M512CvttRoundpsEpu64(a x86.M256, sae int) (dst x86.M512i)

M512CvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512CvttpdEpi64

func M512CvttpdEpi64(a x86.M512d) (dst x86.M512i)

M512CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_cvttpd_epi64'. Requires AVX512DQ.

func M512CvttpdEpu64

func M512CvttpdEpu64(a x86.M512d) (dst x86.M512i)

M512CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_cvttpd_epu64'. Requires AVX512DQ.

func M512CvttpsEpi64

func M512CvttpsEpi64(a x86.M256) (dst x86.M512i)

M512CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_cvttps_epi64'. Requires AVX512DQ.

func M512CvttpsEpu64

func M512CvttpsEpu64(a x86.M256) (dst x86.M512i)

M512CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_cvttps_epu64'. Requires AVX512DQ.

func M512Extractf32x8Ps

func M512Extractf32x8Ps(a x86.M512, imm8 byte) (dst x86.M256)

M512Extractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extractf64x2Pd

func M512Extractf64x2Pd(a x86.M512d, imm8 byte) (dst x86.M128d)

M512Extractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extracti32x8Epi32

func M512Extracti32x8Epi32(a x86.M512i, imm8 byte) (dst x86.M256i)

M512Extracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extracti64x2Epi64

func M512Extracti64x2Epi64(a x86.M512i, imm8 byte) (dst x86.M128i)

M512Extracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512FpclassPdMask

func M512FpclassPdMask(a x86.M512d, imm8 byte) (dst x86.Mmask8)

M512FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm512_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512FpclassPsMask

func M512FpclassPsMask(a x86.M512, imm8 byte) (dst x86.Mmask16)

M512FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 15
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:16] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm512_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Insertf32x8

func M512Insertf32x8(a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512Insertf32x8: Copy 'a' to 'dst', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Insertf64x2

func M512Insertf64x2(a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512Insertf64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Inserti32x8

func M512Inserti32x8(a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512Inserti32x8: Copy 'a' to 'dst', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Inserti64x2

func M512Inserti64x2(a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512Inserti64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskAndPd

func M512MaskAndPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_mask_and_pd'. Requires AVX512DQ.

func M512MaskAndPs

func M512MaskAndPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_mask_and_ps'. Requires AVX512DQ.

func M512MaskAndnotPd

func M512MaskAndnotPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_mask_andnot_pd'. Requires AVX512DQ.

func M512MaskAndnotPs

func M512MaskAndnotPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_mask_andnot_ps'. Requires AVX512DQ.

func M512MaskBroadcastF32x2

func M512MaskBroadcastF32x2(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_mask_broadcast_f32x2'. Requires AVX512DQ.

func M512MaskBroadcastF32x8

func M512MaskBroadcastF32x8(src x86.M512, k x86.Mmask16, a x86.M256) (dst x86.M512)

M512MaskBroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_mask_broadcast_f32x8'. Requires AVX512DQ.

func M512MaskBroadcastF64x2

func M512MaskBroadcastF64x2(src x86.M512d, k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_mask_broadcast_f64x2'. Requires AVX512DQ.

func M512MaskBroadcastI32x2

func M512MaskBroadcastI32x2(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_mask_broadcast_i32x2'. Requires AVX512DQ.

func M512MaskBroadcastI32x8

func M512MaskBroadcastI32x8(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskBroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_mask_broadcast_i32x8'. Requires AVX512DQ.

func M512MaskBroadcastI64x2

func M512MaskBroadcastI64x2(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_mask_broadcast_i64x2'. Requires AVX512DQ.

func M512MaskCvtRoundepi64Pd

func M512MaskCvtRoundepi64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskCvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_mask_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512MaskCvtRoundepi64Ps

func M512MaskCvtRoundepi64Ps(src x86.M256, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskCvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512MaskCvtRoundepu64Pd

func M512MaskCvtRoundepu64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskCvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_mask_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512MaskCvtRoundepu64Ps

func M512MaskCvtRoundepu64Ps(src x86.M256, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskCvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512MaskCvtRoundpdEpi64

func M512MaskCvtRoundpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskCvtRoundpdEpu64

func M512MaskCvtRoundpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskCvtRoundpsEpi64

func M512MaskCvtRoundpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_mask_cvt_roundps_epi64'. Requires AVX512DQ.

func M512MaskCvtRoundpsEpu64

func M512MaskCvtRoundpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_mask_cvt_roundps_epu64'. Requires AVX512DQ.

func M512MaskCvtepi64Pd

func M512MaskCvtepi64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_mask_cvtepi64_pd'. Requires AVX512DQ.

func M512MaskCvtepi64Ps

func M512MaskCvtepi64Ps(src x86.M256, k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_mask_cvtepi64_ps'. Requires AVX512DQ.

func M512MaskCvtepu64Pd

func M512MaskCvtepu64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_mask_cvtepu64_pd'. Requires AVX512DQ.

func M512MaskCvtepu64Ps

func M512MaskCvtepu64Ps(src x86.M256, k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_mask_cvtepu64_ps'. Requires AVX512DQ.

func M512MaskCvtpdEpi64

func M512MaskCvtpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_mask_cvtpd_epi64'. Requires AVX512DQ.

func M512MaskCvtpdEpu64

func M512MaskCvtpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_mask_cvtpd_epu64'. Requires AVX512DQ.

func M512MaskCvtpsEpi64

func M512MaskCvtpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_mask_cvtps_epi64'. Requires AVX512DQ.

func M512MaskCvtpsEpu64

func M512MaskCvtpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_mask_cvtps_epu64'. Requires AVX512DQ.

func M512MaskCvttRoundpdEpi64

func M512MaskCvttRoundpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskCvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskCvttRoundpdEpu64

func M512MaskCvttRoundpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskCvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskCvttRoundpsEpi64

func M512MaskCvttRoundpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		l := j*32
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512MaskCvttRoundpsEpu64

func M512MaskCvttRoundpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		l := j*32
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512MaskCvttpdEpi64

func M512MaskCvttpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_mask_cvttpd_epi64'. Requires AVX512DQ.

func M512MaskCvttpdEpu64

func M512MaskCvttpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_mask_cvttpd_epu64'. Requires AVX512DQ.

func M512MaskCvttpsEpi64

func M512MaskCvttpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_mask_cvttps_epi64'. Requires AVX512DQ.

func M512MaskCvttpsEpu64

func M512MaskCvttpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_mask_cvttps_epu64'. Requires AVX512DQ.

func M512MaskExtractf32x8Ps

func M512MaskExtractf32x8Ps(src x86.M256, k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M256)

M512MaskExtractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_mask_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtractf64x2Pd

func M512MaskExtractf64x2Pd(src x86.M128d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M128d)

M512MaskExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_mask_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti32x8Epi32

func M512MaskExtracti32x8Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskExtracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_mask_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti64x2Epi64

func M512MaskExtracti64x2Epi64(src x86.M128i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_mask_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskFpclassPdMask

func M512MaskFpclassPdMask(k1 x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.Mmask8)

M512MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm512_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskFpclassPsMask

func M512MaskFpclassPsMask(k1 x86.Mmask16, a x86.M512, imm8 byte) (dst x86.Mmask16)

M512MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 15
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:16] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm512_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf32x8

func M512MaskInsertf32x8(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512MaskInsertf32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_mask_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf64x2

func M512MaskInsertf64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512MaskInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) OF
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_mask_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti32x8

func M512MaskInserti32x8(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskInserti32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_mask_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti64x2

func M512MaskInserti64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) OF
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_mask_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskMulloEpi64

func M512MaskMulloEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_mask_mullo_epi64'. Requires AVX512DQ.

func M512MaskOrPd

func M512MaskOrPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_mask_or_pd'. Requires AVX512DQ.

func M512MaskOrPs

func M512MaskOrPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_mask_or_ps'. Requires AVX512DQ.

func M512MaskRangePd

func M512MaskRangePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangePs

func M512MaskRangePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangeRoundPd

func M512MaskRangeRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskRangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_mask_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangeRoundPs

func M512MaskRangeRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskRangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_mask_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReducePd

func M512MaskReducePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReducePs

func M512MaskReducePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReduceRoundPd

func M512MaskReduceRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_mask_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReduceRoundPs

func M512MaskReduceRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_mask_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskXorPd

func M512MaskXorPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_mask_xor_pd'. Requires AVX512DQ.

func M512MaskXorPs

func M512MaskXorPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_mask_xor_ps'. Requires AVX512DQ.

func M512MaskzAndPd

func M512MaskzAndPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_maskz_and_pd'. Requires AVX512DQ.

func M512MaskzAndPs

func M512MaskzAndPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_maskz_and_ps'. Requires AVX512DQ.

func M512MaskzAndnotPd

func M512MaskzAndnotPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_maskz_andnot_pd'. Requires AVX512DQ.

func M512MaskzAndnotPs

func M512MaskzAndnotPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_maskz_andnot_ps'. Requires AVX512DQ.

func M512MaskzBroadcastF32x2

func M512MaskzBroadcastF32x2(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_maskz_broadcast_f32x2'. Requires AVX512DQ.

func M512MaskzBroadcastF32x8

func M512MaskzBroadcastF32x8(k x86.Mmask16, a x86.M256) (dst x86.M512)

M512MaskzBroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_maskz_broadcast_f32x8'. Requires AVX512DQ.

func M512MaskzBroadcastF64x2

func M512MaskzBroadcastF64x2(k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskzBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_maskz_broadcast_f64x2'. Requires AVX512DQ.

func M512MaskzBroadcastI32x2

func M512MaskzBroadcastI32x2(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_maskz_broadcast_i32x2'. Requires AVX512DQ.

func M512MaskzBroadcastI32x8

func M512MaskzBroadcastI32x8(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzBroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_maskz_broadcast_i32x8'. Requires AVX512DQ.

func M512MaskzBroadcastI64x2

func M512MaskzBroadcastI64x2(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_maskz_broadcast_i64x2'. Requires AVX512DQ.

func M512MaskzCvtRoundepi64Pd

func M512MaskzCvtRoundepi64Pd(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskzCvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_maskz_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512MaskzCvtRoundepi64Ps

func M512MaskzCvtRoundepi64Ps(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskzCvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512MaskzCvtRoundepu64Pd

func M512MaskzCvtRoundepu64Pd(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskzCvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_maskz_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512MaskzCvtRoundepu64Ps

func M512MaskzCvtRoundepu64Ps(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskzCvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512MaskzCvtRoundpdEpi64

func M512MaskzCvtRoundpdEpi64(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskzCvtRoundpdEpu64

func M512MaskzCvtRoundpdEpu64(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskzCvtRoundpsEpi64

func M512MaskzCvtRoundpsEpi64(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epi64'. Requires AVX512DQ.

func M512MaskzCvtRoundpsEpu64

func M512MaskzCvtRoundpsEpu64(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epu64'. Requires AVX512DQ.

func M512MaskzCvtepi64Pd

func M512MaskzCvtepi64Pd(k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_maskz_cvtepi64_pd'. Requires AVX512DQ.

func M512MaskzCvtepi64Ps

func M512MaskzCvtepi64Ps(k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_maskz_cvtepi64_ps'. Requires AVX512DQ.

func M512MaskzCvtepu64Pd

func M512MaskzCvtepu64Pd(k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_maskz_cvtepu64_pd'. Requires AVX512DQ.

func M512MaskzCvtepu64Ps

func M512MaskzCvtepu64Ps(k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_maskz_cvtepu64_ps'. Requires AVX512DQ.

func M512MaskzCvtpdEpi64

func M512MaskzCvtpdEpi64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_maskz_cvtpd_epi64'. Requires AVX512DQ.

func M512MaskzCvtpdEpu64

func M512MaskzCvtpdEpu64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_maskz_cvtpd_epu64'. Requires AVX512DQ.

func M512MaskzCvtpsEpi64

func M512MaskzCvtpsEpi64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_maskz_cvtps_epi64'. Requires AVX512DQ.

func M512MaskzCvtpsEpu64

func M512MaskzCvtpsEpu64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_maskz_cvtps_epu64'. Requires AVX512DQ.

func M512MaskzCvttRoundpdEpi64

func M512MaskzCvttRoundpdEpi64(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskzCvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskzCvttRoundpdEpu64

func M512MaskzCvttRoundpdEpu64(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskzCvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskzCvttRoundpsEpi64

func M512MaskzCvttRoundpsEpi64(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512MaskzCvttRoundpsEpu64

func M512MaskzCvttRoundpsEpu64(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512MaskzCvttpdEpi64

func M512MaskzCvttpdEpi64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_maskz_cvttpd_epi64'. Requires AVX512DQ.

func M512MaskzCvttpdEpu64

func M512MaskzCvttpdEpu64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_maskz_cvttpd_epu64'. Requires AVX512DQ.

func M512MaskzCvttpsEpi64

func M512MaskzCvttpsEpi64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_maskz_cvttps_epi64'. Requires AVX512DQ.

func M512MaskzCvttpsEpu64

func M512MaskzCvttpsEpu64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_maskz_cvttps_epu64'. Requires AVX512DQ.

func M512MaskzExtractf32x8Ps

func M512MaskzExtractf32x8Ps(k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M256)

M512MaskzExtractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_maskz_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtractf64x2Pd

func M512MaskzExtractf64x2Pd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M128d)

M512MaskzExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_maskz_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti32x8Epi32

func M512MaskzExtracti32x8Epi32(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskzExtracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_maskz_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti64x2Epi64

func M512MaskzExtracti64x2Epi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskzExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_maskz_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInsertf32x8

func M512MaskzInsertf32x8(k x86.Mmask16, a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512MaskzInsertf32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_maskz_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInsertf64x2

func M512MaskzInsertf64x2(k x86.Mmask8, a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512MaskzInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_maskz_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti32x8

func M512MaskzInserti32x8(k x86.Mmask16, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskzInserti32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_maskz_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti64x2

func M512MaskzInserti64x2(k x86.Mmask8, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskzInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_maskz_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzMulloEpi64

func M512MaskzMulloEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_maskz_mullo_epi64'. Requires AVX512DQ.

func M512MaskzOrPd

func M512MaskzOrPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_maskz_or_pd'. Requires AVX512DQ.

func M512MaskzOrPs

func M512MaskzOrPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_maskz_or_ps'. Requires AVX512DQ.

func M512MaskzRangePd

func M512MaskzRangePd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangePs

func M512MaskzRangePs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangeRoundPd

func M512MaskzRangeRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzRangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_maskz_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangeRoundPs

func M512MaskzRangeRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzRangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_maskz_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReducePd

func M512MaskzReducePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReducePs

func M512MaskzReducePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReduceRoundPd

func M512MaskzReduceRoundPd(k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_maskz_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReduceRoundPs

func M512MaskzReduceRoundPs(k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_maskz_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzXorPd

func M512MaskzXorPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_maskz_xor_pd'. Requires AVX512DQ.

func M512MaskzXorPs

func M512MaskzXorPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_maskz_xor_ps'. Requires AVX512DQ.

func M512Movepi32Mask

func M512Movepi32Mask(a x86.M512i) (dst x86.Mmask16)

M512Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm512_movepi32_mask'. Requires AVX512DQ.

func M512Movepi64Mask

func M512Movepi64Mask(a x86.M512i) (dst x86.Mmask8)

M512Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm512_movepi64_mask'. Requires AVX512DQ.

func M512MovmEpi32

func M512MovmEpi32(k x86.Mmask16) (dst x86.M512i)

M512MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm512_movm_epi32'. Requires AVX512DQ.

func M512MovmEpi64

func M512MovmEpi64(k x86.Mmask8) (dst x86.M512i)

M512MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm512_movm_epi64'. Requires AVX512DQ.

func M512MulloEpi64

func M512MulloEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 7
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_mullo_epi64'. Requires AVX512DQ.

func M512OrPd

func M512OrPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512OrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_or_pd'. Requires AVX512DQ.

func M512OrPs

func M512OrPs(a x86.M512, b x86.M512) (dst x86.M512)

M512OrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_or_ps'. Requires AVX512DQ.

func M512RangePd

func M512RangePd(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangePs

func M512RangePs(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangeRoundPd

func M512RangeRoundPd(a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512RangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangeRoundPs

func M512RangeRoundPs(a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512RangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReducePd

func M512ReducePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReducePs

func M512ReducePs(a x86.M512, imm8 byte) (dst x86.M512)

M512ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReduceRoundPd

func M512ReduceRoundPd(a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512ReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReduceRoundPs

func M512ReduceRoundPs(a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512ReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512XorPd

func M512XorPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512XorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_xor_pd'. Requires AVX512DQ.

func M512XorPs

func M512XorPs(a x86.M512, b x86.M512) (dst x86.M512)

M512XorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_xor_ps'. Requires AVX512DQ.

func MaskAndPd

func MaskAndPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm_mask_and_pd'. Requires AVX512DQ.

func MaskAndPs

func MaskAndPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm_mask_and_ps'. Requires AVX512DQ.

func MaskAndnotPd

func MaskAndnotPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm_mask_andnot_pd'. Requires AVX512DQ.

func MaskAndnotPs

func MaskAndnotPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm_mask_andnot_ps'. Requires AVX512DQ.

func MaskBroadcastI32x2

func MaskBroadcastI32x2(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[n+31:n]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_mask_broadcast_i32x2'. Requires AVX512DQ.

func MaskCvtepi64Pd

func MaskCvtepi64Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_mask_cvtepi64_pd'. Requires AVX512DQ.

func MaskCvtepi64Ps

func MaskCvtepi64Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_mask_cvtepi64_ps'. Requires AVX512DQ.

func MaskCvtepu64Pd

func MaskCvtepu64Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_mask_cvtepu64_pd'. Requires AVX512DQ.

func MaskCvtepu64Ps

func MaskCvtepu64Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_mask_cvtepu64_ps'. Requires AVX512DQ.

func MaskCvtpdEpi64

func MaskCvtpdEpi64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_mask_cvtpd_epi64'. Requires AVX512DQ.

func MaskCvtpdEpu64

func MaskCvtpdEpu64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_mask_cvtpd_epu64'. Requires AVX512DQ.

func MaskCvtpsEpi64

func MaskCvtpsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_mask_cvtps_epi64'. Requires AVX512DQ.

func MaskCvtpsEpu64

func MaskCvtpsEpu64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_mask_cvtps_epu64'. Requires AVX512DQ.

func MaskCvttpdEpi64

func MaskCvttpdEpi64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_mask_cvttpd_epi64'. Requires AVX512DQ.

func MaskCvttpdEpu64

func MaskCvttpdEpu64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_mask_cvttpd_epu64'. Requires AVX512DQ.

func MaskCvttpsEpi64

func MaskCvttpsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_mask_cvttps_epi64'. Requires AVX512DQ.

func MaskCvttpsEpu64

func MaskCvttpsEpu64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_mask_cvttps_epu64'. Requires AVX512DQ.

func MaskFpclassPdMask

func MaskFpclassPdMask(k1 x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 1
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:2] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassPsMask

func MaskFpclassPsMask(k1 x86.Mmask8, a x86.M128, imm8 byte) (dst x86.Mmask8)

MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassSdMask

func MaskFpclassSdMask(k1 x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskFpclassSdMask: Test the lower double-precision (64-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		IF k1[0]
			k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
		ELSE
			k[0] := 0
		FI
		k[MAX:1] := 0

Instruction: 'VFPCLASSSD'. Intrinsic: '_mm_mask_fpclass_sd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassSsMask

func MaskFpclassSsMask(k1 x86.Mmask8, a x86.M128, imm8 byte) (dst x86.Mmask8)

MaskFpclassSsMask: Test the lower single-precision (32-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		IF k1[0]
			k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
		ELSE
			k[0] := 0
		FI
		k[MAX:1] := 0

Instruction: 'VFPCLASSSS'. Intrinsic: '_mm_mask_fpclass_ss_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskMulloEpi64

func MaskMulloEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_mask_mullo_epi64'. Requires AVX512DQ.

func MaskOrPd

func MaskOrPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPD'. Intrinsic: '_mm_mask_or_pd'. Requires AVX512DQ.

func MaskOrPs

func MaskOrPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPS'. Intrinsic: '_mm_mask_or_ps'. Requires AVX512DQ.

func MaskRangePd

func MaskRangePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangePs

func MaskRangePs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeRoundSd

func MaskRangeRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskRangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_mask_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeRoundSs

func MaskRangeRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskRangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_mask_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeSd

func MaskRangeSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRangeSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_mask_range_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeSs

func MaskRangeSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRangeSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_mask_range_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReducePd

func MaskReducePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReducePs

func MaskReducePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceRoundSd

func MaskReduceRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		IF k[0]
			dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_mask_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceRoundSs

func MaskReduceRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		IF k[0]
			dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_mask_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceSd

func MaskReduceSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

IF k[0]
	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_mask_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceSs

func MaskReduceSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

IF k[0]
	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_mask_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskXorPd

func MaskXorPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm_mask_xor_pd'. Requires AVX512DQ.

func MaskXorPs

func MaskXorPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm_mask_xor_ps'. Requires AVX512DQ.

func MaskzAndPd

func MaskzAndPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm_maskz_and_pd'. Requires AVX512DQ.

func MaskzAndPs

func MaskzAndPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm_maskz_and_ps'. Requires AVX512DQ.

func MaskzAndnotPd

func MaskzAndnotPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm_maskz_andnot_pd'. Requires AVX512DQ.

func MaskzAndnotPs

func MaskzAndnotPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm_maskz_andnot_ps'. Requires AVX512DQ.

func MaskzBroadcastI32x2

func MaskzBroadcastI32x2(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_maskz_broadcast_i32x2'. Requires AVX512DQ.

func MaskzCvtepi64Pd

func MaskzCvtepi64Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_maskz_cvtepi64_pd'. Requires AVX512DQ.

func MaskzCvtepi64Ps

func MaskzCvtepi64Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_maskz_cvtepi64_ps'. Requires AVX512DQ.

func MaskzCvtepu64Pd

func MaskzCvtepu64Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_maskz_cvtepu64_pd'. Requires AVX512DQ.

func MaskzCvtepu64Ps

func MaskzCvtepu64Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_maskz_cvtepu64_ps'. Requires AVX512DQ.

func MaskzCvtpdEpi64

func MaskzCvtpdEpi64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_maskz_cvtpd_epi64'. Requires AVX512DQ.

func MaskzCvtpdEpu64

func MaskzCvtpdEpu64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_maskz_cvtpd_epu64'. Requires AVX512DQ.

func MaskzCvtpsEpi64

func MaskzCvtpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_maskz_cvtps_epi64'. Requires AVX512DQ.

func MaskzCvtpsEpu64

func MaskzCvtpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_maskz_cvtps_epu64'. Requires AVX512DQ.

func MaskzCvttpdEpi64

func MaskzCvttpdEpi64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_maskz_cvttpd_epi64'. Requires AVX512DQ.

func MaskzCvttpdEpu64

func MaskzCvttpdEpu64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_maskz_cvttpd_epu64'. Requires AVX512DQ.

func MaskzCvttpsEpi64

func MaskzCvttpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_maskz_cvttps_epi64'. Requires AVX512DQ.

func MaskzCvttpsEpu64

func MaskzCvttpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_maskz_cvttps_epu64'. Requires AVX512DQ.

func MaskzMulloEpi64

func MaskzMulloEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_maskz_mullo_epi64'. Requires AVX512DQ.

func MaskzOrPd

func MaskzOrPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPD'. Intrinsic: '_mm_maskz_or_pd'. Requires AVX512DQ.

func MaskzOrPs

func MaskzOrPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPS'. Intrinsic: '_mm_maskz_or_ps'. Requires AVX512DQ.

func MaskzRangePd

func MaskzRangePd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangePs

func MaskzRangePs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeRoundSd

func MaskzRangeRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzRangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_maskz_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeRoundSs

func MaskzRangeRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzRangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_maskz_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeSd

func MaskzRangeSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRangeSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_maskz_range_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeSs

func MaskzRangeSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRangeSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_maskz_range_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReducePd

func MaskzReducePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReducePs

func MaskzReducePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceRoundSd

func MaskzReduceRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		IF k[0]
			dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_maskz_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceRoundSs

func MaskzReduceRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		IF k[0]
			dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_maskz_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceSd

func MaskzReduceSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

IF k[0]
	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_maskz_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceSs

func MaskzReduceSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

IF k[0]
	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_maskz_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzXorPd

func MaskzXorPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm_maskz_xor_pd'. Requires AVX512DQ.

func MaskzXorPs

func MaskzXorPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm_maskz_xor_ps'. Requires AVX512DQ.

func Movepi32Mask

func Movepi32Mask(a x86.M128i) (dst x86.Mmask8)

Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 3
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm_movepi32_mask'. Requires AVX512DQ.

func Movepi64Mask

func Movepi64Mask(a x86.M128i) (dst x86.Mmask8)

Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm_movepi64_mask'. Requires AVX512DQ.

func MovmEpi32

func MovmEpi32(k x86.Mmask8) (dst x86.M128i)

MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm_movm_epi32'. Requires AVX512DQ.

func MovmEpi64

func MovmEpi64(k x86.Mmask8) (dst x86.M128i)

MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm_movm_epi64'. Requires AVX512DQ.

func MulloEpi64

func MulloEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 1
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_mullo_epi64'. Requires AVX512DQ.

func RangePd

func RangePd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangePs

func RangePs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangeRoundSd

func RangeRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

RangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangeRoundSs

func RangeRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

RangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReducePd

func ReducePd(a x86.M128d, imm8 byte) (dst x86.M128d)

ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReducePs

func ReducePs(a x86.M128, imm8 byte) (dst x86.M128)

ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceRoundSd

func ReduceRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

ReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceRoundSs

func ReduceRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

ReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceSd

func ReduceSd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

ReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceSs

func ReduceSs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

ReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL