kncni

package

v0.0.0-...-3878f85 Latest Latest Go to latest Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/klauspost/intrinsics

Links

Open Source Insights

Documentation ¶

Overview ¶

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index ¶

func Countbits32(r1 uint32) uint32
func Countbits64(r1 uint64) uint64
func Delay32(r1 uint32)
func Delay64(r1 uint64)
func M512AbsPd(v2 x86.M512d) (dst x86.M512d)
func M512AbsPs(v2 x86.M512) (dst x86.M512)
func M512AdcEpi32(v2 x86.M512i, k2 x86.Mmask16, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)
func M512AddEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AddPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512AddPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512AddRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512AddRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512AddnPd(v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)
func M512AddnPs(v2 x86.M512, v3 x86.M512) (dst x86.M512)
func M512AddnRoundPd(v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)
func M512AddnRoundPs(v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)
func M512AddsetcEpi32(v2 x86.M512i, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)
func M512AddsetsEpi32(v2 x86.M512i, v3 x86.M512i, sign *x86.Mmask16) (dst x86.M512i)
func M512AddsetsPs(v2 x86.M512, v3 x86.M512, sign *x86.Mmask16) (dst x86.M512)
func M512AddsetsRoundPs(v2 x86.M512, v3 x86.M512, sign *x86.Mmask16, rounding int) (dst x86.M512)
func M512AlignrEpi32(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
func M512AndEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AndEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AndSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AndnotEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AndnotEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512AndnotSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512CastpdPs(a x86.M512d) (dst x86.M512)
func M512CastpdSi512(a x86.M512d) (dst x86.M512i)
func M512CastpsPd(a x86.M512) (dst x86.M512d)
func M512CastpsSi512(a x86.M512) (dst x86.M512i)
func M512Castsi512Pd(a x86.M512i) (dst x86.M512d)
func M512Castsi512Ps(a x86.M512i) (dst x86.M512)
func M512CmpEpi32Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)
func M512CmpEpu32Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)
func M512CmpPdMask(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.Mmask8)
func M512CmpPsMask(a x86.M512, b x86.M512, imm8 byte) (dst x86.Mmask16)
func M512CmpRoundPdMask(a x86.M512d, b x86.M512d, imm8 byte, sae int) (dst x86.Mmask8)
func M512CmpRoundPsMask(a x86.M512, b x86.M512, imm8 byte, sae int) (dst x86.Mmask16)
func M512CmpeqEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpeqEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpeqPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpeqPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpgeEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpgeEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpgtEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpgtEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpleEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpleEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmplePdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmplePsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpltEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpltEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpltPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpltPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpneqEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpneqEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512CmpneqPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpneqPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpnlePdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpnlePsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpnltPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpnltPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpordPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpordPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CmpunordPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512CmpunordPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512CvtRoundpdPslo(v2 x86.M512d, rounding int) (dst x86.M512)
func M512Cvtepi32loPd(v2 x86.M512i) (dst x86.M512d)
func M512Cvtepu32loPd(v2 x86.M512i) (dst x86.M512d)
func M512CvtfxpntRoundAdjustepi32Ps(v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512CvtfxpntRoundAdjustepu32Ps(v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512CvtfxpntRoundAdjustpsEpi32(v2 x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512i)
func M512CvtfxpntRoundAdjustpsEpu32(v2 x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512i)
func M512CvtfxpntRoundpdEpi32lo(v2 x86.M512d, rounding int) (dst x86.M512i)
func M512CvtfxpntRoundpdEpu32lo(v2 x86.M512d, rounding int) (dst x86.M512i)
func M512CvtpdPslo(v2 x86.M512d) (dst x86.M512)
func M512CvtpsloPd(v2 x86.M512) (dst x86.M512d)
func M512Exp223Ps(v2 x86.M512i) (dst x86.M512)
func M512FixupnanPd(v1 x86.M512d, v2 x86.M512d, v3 x86.M512i) (dst x86.M512d)
func M512FixupnanPs(v1 x86.M512, v2 x86.M512, v3 x86.M512i) (dst x86.M512)
func M512Fmadd233Epi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512Fmadd233Ps(a x86.M512, b x86.M512) (dst x86.M512)
func M512Fmadd233RoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512FmaddEpi32(a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i)
func M512FmaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512FmaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
func M512FmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512FmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512FmsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512FmsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
func M512FmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512FmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512FnmaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512FnmaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
func M512FnmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512FnmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512FnmsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512FnmsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)
func M512FnmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512FnmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512GetexpPd(a x86.M512d) (dst x86.M512d)
func M512GetexpPs(a x86.M512) (dst x86.M512)
func M512GetexpRoundPd(a x86.M512d, rounding int) (dst x86.M512d)
func M512GetexpRoundPs(a x86.M512, rounding int) (dst x86.M512)
func M512GetmantPd(a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)
func M512GetmantPs(a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)
func M512GetmantRoundPd(a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)
func M512GetmantRoundPs(a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)
func M512GmaxPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512GmaxPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512GmaxabsPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512GminPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512GminPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512Int2mask(mask int) (dst x86.Mmask16)
func M512Kand(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
func M512Kandn(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
func M512Kandnr(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)
func M512Kconcathi64(k1 x86.Mmask16, k2 x86.Mmask16) int64
func M512Kconcatlo64(k1 x86.Mmask16, k2 x86.Mmask16) int64
func M512Kextract64(a int64, b int) (dst x86.Mmask16)
func M512Kmerge2l1h(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)
func M512Kmerge2l1l(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)
func M512Kmov(a x86.Mmask16) (dst x86.Mmask16)
func M512Kmovlhb(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)
func M512Knot(a x86.Mmask16) (dst x86.Mmask16)
func M512Kor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
func M512Kortestc(k1 x86.Mmask16, k2 x86.Mmask16) int
func M512Kortestz(k1 x86.Mmask16, k2 x86.Mmask16) int
func M512Kswapb(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)
func M512Kxnor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
func M512Kxor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)
func M512Log2Ps(a x86.M512) (dst x86.M512)
func M512Log2ae23Ps(a x86.M512) (dst x86.M512)
func M512Mask2int(k1 x86.Mmask16) int
func M512Mask3FmaddEpi32(a x86.M512i, b x86.M512i, c x86.M512i, k x86.Mmask16) (dst x86.M512i)
func M512Mask3FmaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
func M512Mask3FmaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
func M512Mask3FmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
func M512Mask3FmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
func M512Mask3FmsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
func M512Mask3FmsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
func M512Mask3FmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
func M512Mask3FmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
func M512Mask3FnmaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
func M512Mask3FnmaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
func M512Mask3FnmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
func M512Mask3FnmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
func M512Mask3FnmsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)
func M512Mask3FnmsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)
func M512Mask3FnmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)
func M512Mask3FnmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)
func M512MaskAbsPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d) (dst x86.M512d)
func M512MaskAbsPs(src x86.M512, k x86.Mmask16, v2 x86.M512) (dst x86.M512)
func M512MaskAdcEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskAddEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskAddPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskAddPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskAddRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512MaskAddRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512MaskAddnPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)
func M512MaskAddnPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512) (dst x86.M512)
func M512MaskAddnRoundPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)
func M512MaskAddnRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)
func M512MaskAddsetcEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskAddsetsEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i, sign *x86.Mmask16) (dst x86.M512i)
func M512MaskAddsetsPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, sign *x86.Mmask16) (dst x86.M512)
func M512MaskAddsetsRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, sign *x86.Mmask16, ...) (dst x86.M512)
func M512MaskAlignrEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)
func M512MaskAndEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)
func M512MaskAndEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskAndnotEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskAndnotEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskBlendEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskBlendEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskBlendPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskBlendPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskCmpEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)
func M512MaskCmpEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)
func M512MaskCmpPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.Mmask8)
func M512MaskCmpPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.Mmask16)
func M512MaskCmpRoundPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, sae int) (dst x86.Mmask8)
func M512MaskCmpRoundPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, sae int) (dst x86.Mmask16)
func M512MaskCmpeqEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpeqEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpeqPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpeqPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpgeEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpgeEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpgtEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpgtEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpleEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpleEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmplePdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmplePsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpltEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpltEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpltPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpltPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpneqEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpneqEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskCmpneqPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpneqPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpnlePdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpnlePsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpnltPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpnltPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpordPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpordPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCmpunordPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)
func M512MaskCmpunordPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)
func M512MaskCvtRoundpdPslo(src x86.M512, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512)
func M512MaskCvtepi32loPd(src x86.M512d, k x86.Mmask8, v2 x86.M512i) (dst x86.M512d)
func M512MaskCvtepu32loPd(src x86.M512d, k x86.Mmask8, v2 x86.M512i) (dst x86.M512d)
func M512MaskCvtfxpntRoundAdjustepu32Ps(src x86.M512, k x86.Mmask16, v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512MaskCvtfxpntRoundpdEpi32lo(src x86.M512i, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512i)
func M512MaskCvtfxpntRoundpdEpu32lo(src x86.M512i, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512i)
func M512MaskCvtpdPslo(src x86.M512, k x86.Mmask8, v2 x86.M512d) (dst x86.M512)
func M512MaskCvtpsloPd(src x86.M512d, k x86.Mmask8, v2 x86.M512) (dst x86.M512d)
func M512MaskExp223Ps(src x86.M512, k x86.Mmask16, v2 x86.M512i) (dst x86.M512)
func M512MaskFixupnanPd(v1 x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512i) (dst x86.M512d)
func M512MaskFixupnanPs(v1 x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512i) (dst x86.M512)
func M512MaskFmadd233Epi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskFmadd233Ps(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskFmadd233RoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512MaskFmaddEpi32(a x86.M512i, k x86.Mmask16, b x86.M512i, c x86.M512i) (dst x86.M512i)
func M512MaskFmaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512MaskFmaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
func M512MaskFmaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512MaskFmaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512MaskFmsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512MaskFmsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
func M512MaskFmsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512MaskFmsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512MaskFnmaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512MaskFnmaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
func M512MaskFnmaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512MaskFnmaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512MaskFnmsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)
func M512MaskFnmsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)
func M512MaskFnmsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)
func M512MaskFnmsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)
func M512MaskGetexpPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
func M512MaskGetexpPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskGetexpRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)
func M512MaskGetexpRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)
func M512MaskGetmantPd(src x86.M512d, k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, ...) (dst x86.M512d)
func M512MaskGetmantPs(src x86.M512, k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, ...) (dst x86.M512)
func M512MaskGetmantRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, ...) (dst x86.M512d)
func M512MaskGetmantRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, ...) (dst x86.M512)
func M512MaskGmaxPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskGmaxPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskGmaxabsPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskGminPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskGminPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskLog2Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskLog2ae23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskMaxEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMaxEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMaxabsPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskMinEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMinEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMovEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)
func M512MaskMovEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)
func M512MaskMovPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)
func M512MaskMovPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskMulPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskMulPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskMulRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512MaskMulRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512MaskMulhiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMulhiEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskMulloEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskOrEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskOrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskPermute4f128Epi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
func M512MaskPermute4f128Ps(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)
func M512MaskPermutevarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)
func M512MaskRcp23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskReduceAddEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceAddEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskReduceAddPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceAddPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceAndEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceAndEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskReduceGmaxPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceGmaxPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceGminPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceGminPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceMaxEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceMaxEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskReduceMaxEpu32(k x86.Mmask16, a x86.M512i) uint32
func M512MaskReduceMaxEpu64(k x86.Mmask8, a x86.M512i) uint64
func M512MaskReduceMaxPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceMaxPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceMinEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceMinEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskReduceMinEpu32(k x86.Mmask16, a x86.M512i) uint32
func M512MaskReduceMinEpu64(k x86.Mmask8, a x86.M512i) uint64
func M512MaskReduceMinPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceMinPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceMulEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceMulEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskReduceMulPd(k x86.Mmask8, a x86.M512d) float64
func M512MaskReduceMulPs(k x86.Mmask16, a x86.M512) float32
func M512MaskReduceOrEpi32(k x86.Mmask16, a x86.M512i) int
func M512MaskReduceOrEpi64(k x86.Mmask8, a x86.M512i) int64
func M512MaskRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512MaskRoundfxpntAdjustPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int, expadj MMEXPADJENUM) (dst x86.M512d)
func M512MaskRoundfxpntAdjustPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512MaskRsqrt23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)
func M512MaskSbbEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskSbbrEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskScalePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)
func M512MaskScaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i, rounding int) (dst x86.M512)
func M512MaskShuffleEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
func M512MaskSlliEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
func M512MaskSllvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512MaskSraiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
func M512MaskSravEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512MaskSrliEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)
func M512MaskSrlvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512MaskSubEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskSubPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MaskSubPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)
func M512MaskSubRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512MaskSubRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512MaskSubrEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)
func M512MaskSubrPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)
func M512MaskSubrPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512) (dst x86.M512)
func M512MaskSubrRoundPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)
func M512MaskSubrRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)
func M512MaskSubrsetbEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskSubsetbEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, ...) (dst x86.M512i)
func M512MaskSwizzleEpi32(src x86.M512i, k x86.Mmask16, v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)
func M512MaskSwizzleEpi64(src x86.M512i, k x86.Mmask8, v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)
func M512MaskSwizzlePd(src x86.M512d, k x86.Mmask8, v x86.M512d, s MMSWIZZLEENUM) (dst x86.M512d)
func M512MaskSwizzlePs(src x86.M512, k x86.Mmask16, v x86.M512, s MMSWIZZLEENUM) (dst x86.M512)
func M512MaskTestEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512MaskXorEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaskXorEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaxEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaxEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MaxabsPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512MinEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MinEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MulPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512MulPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512MulRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512MulRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512MulhiEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MulhiEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512MulloEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512OrEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512OrEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512OrSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512Permute4f128Epi32(a x86.M512i, imm8 byte) (dst x86.M512i)
func M512Permute4f128Ps(a x86.M512, imm8 byte) (dst x86.M512)
func M512PermutevarEpi32(idx x86.M512i, a x86.M512i) (dst x86.M512i)
func M512Rcp23Ps(a x86.M512) (dst x86.M512)
func M512ReduceAddEpi32(a x86.M512i) int
func M512ReduceAddEpi64(a x86.M512i) int64
func M512ReduceAddPd(a x86.M512d) float64
func M512ReduceAddPs(a x86.M512) float32
func M512ReduceAndEpi32(a x86.M512i) int
func M512ReduceAndEpi64(a x86.M512i) int64
func M512ReduceGmaxPd(a x86.M512d) float64
func M512ReduceGmaxPs(a x86.M512) float32
func M512ReduceGminPd(a x86.M512d) float64
func M512ReduceGminPs(a x86.M512) float32
func M512ReduceMaxEpi32(a x86.M512i) int
func M512ReduceMaxEpi64(a x86.M512i) int64
func M512ReduceMaxEpu32(a x86.M512i) uint32
func M512ReduceMaxEpu64(a x86.M512i) uint64
func M512ReduceMaxPd(a x86.M512d) float64
func M512ReduceMaxPs(a x86.M512) float32
func M512ReduceMinEpi32(a x86.M512i) int
func M512ReduceMinEpi64(a x86.M512i) int64
func M512ReduceMinEpu32(a x86.M512i) uint32
func M512ReduceMinEpu64(a x86.M512i) uint64
func M512ReduceMinPd(a x86.M512d) float64
func M512ReduceMinPs(a x86.M512) float32
func M512ReduceMulEpi32(a x86.M512i) int
func M512ReduceMulEpi64(a x86.M512i) int64
func M512ReduceMulPd(a x86.M512d) float64
func M512ReduceMulPs(a x86.M512) float32
func M512ReduceOrEpi32(a x86.M512i) int
func M512ReduceOrEpi64(a x86.M512i) int64
func M512RoundPs(a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512RoundfxpntAdjustPd(a x86.M512d, rounding int, expadj MMEXPADJENUM) (dst x86.M512d)
func M512RoundfxpntAdjustPs(a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)
func M512Rsqrt23Ps(a x86.M512) (dst x86.M512)
func M512SbbEpi32(v2 x86.M512i, k x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)
func M512SbbrEpi32(v2 x86.M512i, k x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)
func M512ScalePs(a x86.M512, b x86.M512i) (dst x86.M512)
func M512ScaleRoundPs(a x86.M512, b x86.M512i, rounding int) (dst x86.M512)
func M512ShuffleEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
func M512SlliEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
func M512SllvEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512SraiEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
func M512SravEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512SrliEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)
func M512SrlvEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)
func M512SubEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512SubPd(a x86.M512d, b x86.M512d) (dst x86.M512d)
func M512SubPs(a x86.M512, b x86.M512) (dst x86.M512)
func M512SubRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)
func M512SubRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)
func M512SubrEpi32(v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)
func M512SubrPd(v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)
func M512SubrPs(v2 x86.M512, v3 x86.M512) (dst x86.M512)
func M512SubrRoundPd(v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)
func M512SubrRoundPs(v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)
func M512SubrsetbEpi32(v2 x86.M512i, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)
func M512SubsetbEpi32(v2 x86.M512i, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)
func M512SwizzleEpi32(v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)
func M512SwizzleEpi64(v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)
func M512SwizzlePd(v x86.M512d, s MMSWIZZLEENUM) (dst x86.M512d)
func M512SwizzlePs(v x86.M512, s MMSWIZZLEENUM) (dst x86.M512)
func M512TestEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)
func M512XorEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512XorEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)
func M512XorSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)
func Prefetch(p *byte, i int)
func Spflt32(r1 uint32)
func Spflt64(r1 uint64)
func Tzcnti32(a int, x uint32) int
func Tzcnti64(a int64, x uint64) int64

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func Countbits32 ¶

func Countbits32(r1 uint32) uint32

Countbits32: Counts the number of set bits in 32-bit unsigned integer 'r1', returning the results in 'dst'.

dst[31:0] := PopCount(r1[31:0])

Instruction: 'POPCNT'. Intrinsic: '_mm_countbits_32'. Requires KNCNI.

func Countbits64 ¶

func Countbits64(r1 uint64) uint64

Countbits64: Counts the number of set bits in 64-bit unsigned integer 'r1', returning the results in 'dst'.

dst[63:0] := PopCount(r1[63:0])

Instruction: 'POPCNT'. Intrinsic: '_mm_countbits_64'. Requires KNCNI.

func Delay32 ¶

func Delay32(r1 uint32)

Delay32: Stalls a thread without blocking other threads for 32-bit unsigned integer 'r1' clock cycles.

BlockThread(r1)

Instruction: 'DELAY'. Intrinsic: '_mm_delay_32'. Requires KNCNI.

func Delay64 ¶

func Delay64(r1 uint64)

Delay64: Stalls a thread without blocking other threads for 64-bit unsigned integer 'r1' clock cycles.

BlockThread(r1)

Instruction: 'DELAY'. Intrinsic: '_mm_delay_64'. Requires KNCNI.

func M512AbsPd ¶

func M512AbsPd(v2 x86.M512d) (dst x86.M512d)

M512AbsPd: Finds the absolute value of each packed double-precision (64-bit) floating-point element in 'v2', storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ABS(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_abs_pd'. Requires KNCNI.

func M512AbsPs ¶

func M512AbsPs(v2 x86.M512) (dst x86.M512)

M512AbsPs: Finds the absolute value of each packed single-precision (32-bit) floating-point element in 'v2', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ABS(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_abs_ps'. Requires KNCNI.

func M512AdcEpi32 ¶

func M512AdcEpi32(v2 x86.M512i, k2 x86.Mmask16, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)

M512AdcEpi32: Performs element-by-element addition of packed 32-bit integers in 'v2' and 'v3' and the corresponding bit in 'k2', storing the result of the addition in 'dst' and the result of the carry in 'k2_res'.

FOR j := 0 to 15
	i := j*32
	k2_res[j]   := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
	dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADCD'. Intrinsic: '_mm512_adc_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512AddEpi32 ¶

func M512AddEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm512_add_epi32'. Requires KNCNI.

func M512AddPd ¶

func M512AddPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512AddPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_add_pd'. Requires KNCNI.

func M512AddPs ¶

func M512AddPs(a x86.M512, b x86.M512) (dst x86.M512)

M512AddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_add_ps'. Requires KNCNI.

func M512AddRoundPd ¶

func M512AddRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512AddRoundPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := a[i+63:i] + b[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_add_round_pd'. Requires KNCNI.

func M512AddRoundPs ¶

func M512AddRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512AddRoundPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := a[i+31:i] + b[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_add_round_ps'. Requires KNCNI.

func M512AddnPd ¶

func M512AddnPd(v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)

M512AddnPd: Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in 'v2' and 'v3' and negates their sum, storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDNPD'. Intrinsic: '_mm512_addn_pd'. Requires KNCNI.

func M512AddnPs ¶

func M512AddnPs(v2 x86.M512, v3 x86.M512) (dst x86.M512)

M512AddnPs: Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in 'v2' and 'v3' and negates their sum, storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDNPS'. Intrinsic: '_mm512_addn_ps'. Requires KNCNI.

func M512AddnRoundPd ¶

func M512AddnRoundPd(v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)

M512AddnRoundPd: Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in 'v2' and 'v3' and negates the sum, storing the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDNPD'. Intrinsic: '_mm512_addn_round_pd'. Requires KNCNI.

func M512AddnRoundPs ¶

func M512AddnRoundPs(v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)

M512AddnRoundPs: Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in 'v2' and 'v3' and negates the sum, storing the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDNPS'. Intrinsic: '_mm512_addn_round_ps'. Requires KNCNI.

func M512AddsetcEpi32 ¶

func M512AddsetcEpi32(v2 x86.M512i, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)

M512AddsetcEpi32: Performs element-by-element addition of packed 32-bit integer elements in 'v2' and 'v3', storing the resultant carry in 'k2_res' (carry flag) and the addition results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
	k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSETCD'. Intrinsic: '_mm512_addsetc_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512AddsetsEpi32 ¶

func M512AddsetsEpi32(v2 x86.M512i, v3 x86.M512i, sign *x86.Mmask16) (dst x86.M512i)

M512AddsetsEpi32: Performs an element-by-element addition of packed 32-bit integer elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag).

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
	sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSETSD'. Intrinsic: '_mm512_addsets_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512AddsetsPs ¶

func M512AddsetsPs(v2 x86.M512, v3 x86.M512, sign *x86.Mmask16) (dst x86.M512)

M512AddsetsPs: Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag).

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
	sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDSETSPS'. Intrinsic: '_mm512_addsets_ps'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512AddsetsRoundPs ¶

func M512AddsetsRoundPs(v2 x86.M512, v3 x86.M512, sign *x86.Mmask16, rounding int) (dst x86.M512)

M512AddsetsRoundPs: Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
			sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDSETSPS'. Intrinsic: '_mm512_addsets_round_ps'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512AlignrEpi32 ¶

func M512AlignrEpi32(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512AlignrEpi32: Concatenate 'a' and 'b' into a 128-byte intermediate result, shift the result right by 'count' 32-bit elements, and store the low 64 bytes (16 elements) in 'dst'.

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
dst[511:0] := temp[511:0]
dst[MAX:512] := 0

Instruction: 'VALIGND'. Intrinsic: '_mm512_alignr_epi32'. Requires KNCNI.

func M512AndEpi32 ¶

func M512AndEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE AND b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_and_epi32'. Requires KNCNI.

func M512AndEpi64 ¶

func M512AndEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndEpi64: Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in 'a' and 'b', and store the results in 'dst'.

dst[511:0] := (a[511:0] AND b[511:0])
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_and_epi64'. Requires KNCNI.

func M512AndSi512 ¶

func M512AndSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndSi512: Compute the bitwise AND of 512 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[511:0] := (a[511:0] AND b[511:0])
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_and_si512'. Requires KNCNI.

func M512AndnotEpi32 ¶

func M512AndnotEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm512_andnot_epi32'. Requires KNCNI.

func M512AndnotEpi64 ¶

func M512AndnotEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndnotEpi64: Compute the bitwise AND NOT of 512 bits (composed of packed 64-bit integers) in 'a' and 'b', and store the results in 'dst'.

dst[511:0] := ((NOT a[511:0]) AND b[511:0])
dst[MAX:512] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm512_andnot_epi64'. Requires KNCNI.

func M512AndnotSi512 ¶

func M512AndnotSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AndnotSi512: Compute the bitwise AND NOT of 512 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[511:0] := ((NOT a[511:0]) AND b[511:0])
dst[MAX:512] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm512_andnot_si512'. Requires KNCNI.

func M512CastpdPs ¶

func M512CastpdPs(a x86.M512d) (dst x86.M512)

M512CastpdPs: Cast vector of type __m512d to type __m512.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd_ps'. Requires KNCNI.

func M512CastpdSi512 ¶

func M512CastpdSi512(a x86.M512d) (dst x86.M512i)

M512CastpdSi512: Cast vector of type __m512d to type __m512i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd_si512'. Requires KNCNI.

func M512CastpsPd ¶

func M512CastpsPd(a x86.M512) (dst x86.M512d)

M512CastpsPd: Cast vector of type __m512 to type __m512d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps_pd'. Requires KNCNI.

func M512CastpsSi512 ¶

func M512CastpsSi512(a x86.M512) (dst x86.M512i)

M512CastpsSi512: Cast vector of type __m512 to type __m512i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps_si512'. Requires KNCNI.

func M512Castsi512Pd ¶

func M512Castsi512Pd(a x86.M512i) (dst x86.M512d)

M512Castsi512Pd: Cast vector of type __m512i to type __m512d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_pd'. Requires KNCNI.

func M512Castsi512Ps ¶

func M512Castsi512Ps(a x86.M512i) (dst x86.M512)

M512Castsi512Ps: Cast vector of type __m512i to type __m512.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_ps'. Requires KNCNI.

func M512CmpEpi32Mask ¶

func M512CmpEpi32Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)

M512CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmp_epi32_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpEpu32Mask ¶

func M512CmpEpu32Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)

M512CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmp_epu32_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpPdMask ¶

func M512CmpPdMask(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.Mmask8)

M512CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmp_pd_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpPsMask ¶

func M512CmpPsMask(a x86.M512, b x86.M512, imm8 byte) (dst x86.Mmask16)

M512CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmp_ps_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpRoundPdMask ¶

func M512CmpRoundPdMask(a x86.M512d, b x86.M512d, imm8 byte, sae int) (dst x86.Mmask8)

M512CmpRoundPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC
	FOR j := 0 to 7
		i := j*64
		k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
	ENDFOR
	k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmp_round_pd_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpRoundPsMask ¶

func M512CmpRoundPsMask(a x86.M512, b x86.M512, imm8 byte, sae int) (dst x86.Mmask16)

M512CmpRoundPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC
	FOR j := 0 to 15
		i := j*32
		k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
	ENDFOR
	k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmp_round_ps_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512CmpeqEpi32Mask ¶

func M512CmpeqEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPEQD'. Intrinsic: '_mm512_cmpeq_epi32_mask'. Requires KNCNI.

func M512CmpeqEpu32Mask ¶

func M512CmpeqEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmpeq_epu32_mask'. Requires KNCNI.

func M512CmpeqPdMask ¶

func M512CmpeqPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpeqPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpeq_pd_mask'. Requires KNCNI.

func M512CmpeqPsMask ¶

func M512CmpeqPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpeqPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpeq_ps_mask'. Requires KNCNI.

func M512CmpgeEpi32Mask ¶

func M512CmpgeEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmpge_epi32_mask'. Requires KNCNI.

func M512CmpgeEpu32Mask ¶

func M512CmpgeEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmpge_epu32_mask'. Requires KNCNI.

func M512CmpgtEpi32Mask ¶

func M512CmpgtEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPGTD'. Intrinsic: '_mm512_cmpgt_epi32_mask'. Requires KNCNI.

func M512CmpgtEpu32Mask ¶

func M512CmpgtEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmpgt_epu32_mask'. Requires KNCNI.

func M512CmpleEpi32Mask ¶

func M512CmpleEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmple_epi32_mask'. Requires KNCNI.

func M512CmpleEpu32Mask ¶

func M512CmpleEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmple_epu32_mask'. Requires KNCNI.

func M512CmplePdMask ¶

func M512CmplePdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmplePdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmple_pd_mask'. Requires KNCNI.

func M512CmplePsMask ¶

func M512CmplePsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmplePsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmple_ps_mask'. Requires KNCNI.

func M512CmpltEpi32Mask ¶

func M512CmpltEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPLTD'. Intrinsic: '_mm512_cmplt_epi32_mask'. Requires KNCNI.

func M512CmpltEpu32Mask ¶

func M512CmpltEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmplt_epu32_mask'. Requires KNCNI.

func M512CmpltPdMask ¶

func M512CmpltPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpltPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmplt_pd_mask'. Requires KNCNI.

func M512CmpltPsMask ¶

func M512CmpltPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpltPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmplt_ps_mask'. Requires KNCNI.

func M512CmpneqEpi32Mask ¶

func M512CmpneqEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmpneq_epi32_mask'. Requires KNCNI.

func M512CmpneqEpu32Mask ¶

func M512CmpneqEpu32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_cmpneq_epu32_mask'. Requires KNCNI.

func M512CmpneqPdMask ¶

func M512CmpneqPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpneqPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpneq_pd_mask'. Requires KNCNI.

func M512CmpneqPsMask ¶

func M512CmpneqPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpneqPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpneq_ps_mask'. Requires KNCNI.

func M512CmpnlePdMask ¶

func M512CmpnlePdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpnlePdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpnle_pd_mask'. Requires KNCNI.

func M512CmpnlePsMask ¶

func M512CmpnlePsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpnlePsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpnle_ps_mask'. Requires KNCNI.

func M512CmpnltPdMask ¶

func M512CmpnltPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpnltPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpnlt_pd_mask'. Requires KNCNI.

func M512CmpnltPsMask ¶

func M512CmpnltPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpnltPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpnlt_ps_mask'. Requires KNCNI.

func M512CmpordPdMask ¶

func M512CmpordPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpordPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpord_pd_mask'. Requires KNCNI.

func M512CmpordPsMask ¶

func M512CmpordPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpordPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpord_ps_mask'. Requires KNCNI.

func M512CmpunordPdMask ¶

func M512CmpunordPdMask(a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512CmpunordPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_cmpunord_pd_mask'. Requires KNCNI.

func M512CmpunordPsMask ¶

func M512CmpunordPsMask(a x86.M512, b x86.M512) (dst x86.Mmask16)

M512CmpunordPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_cmpunord_ps_mask'. Requires KNCNI.

func M512CvtRoundpdPslo ¶

func M512CvtRoundpdPslo(v2 x86.M512d, rounding int) (dst x86.M512)

M512CvtRoundpdPslo: Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to packed single-precision (32-bit) floating-point elements, storing the results in 'dst'. Results are written to the lower half of 'dst', and the upper half locations are set to '0'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			k := j*32
			dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvt_roundpd_pslo'. Requires KNCNI.

func M512Cvtepi32loPd ¶

func M512Cvtepi32loPd(v2 x86.M512i) (dst x86.M512d)

M512Cvtepi32loPd: Performs element-by-element conversion of the lower half of packed 32-bit integer elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	l := j*64
	dst[l+63:l] := Int32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_cvtepi32lo_pd'. Requires KNCNI.

func M512Cvtepu32loPd ¶

func M512Cvtepu32loPd(v2 x86.M512i) (dst x86.M512d)

M512Cvtepu32loPd: Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	k := j*64
	dst[k+63:k] := UInt32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_cvtepu32lo_pd'. Requires KNCNI.

func M512CvtfxpntRoundAdjustepi32Ps ¶

func M512CvtfxpntRoundAdjustepi32Ps(v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512CvtfxpntRoundAdjustepi32Ps: Performs element-by-element conversion of packed 32-bit integer elements in 'v2' to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using 'expadj', storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := Int32ToFloat32(v2[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTDQ2PS'. Intrinsic: '_mm512_cvtfxpnt_round_adjustepi32_ps'. Requires KNCNI.

func M512CvtfxpntRoundAdjustepu32Ps ¶

func M512CvtfxpntRoundAdjustepu32Ps(v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512CvtfxpntRoundAdjustepu32Ps: Performs element-by-element conversion of packed 32-bit unsigned integer elements in 'v2' to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using 'expadj', storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTUDQ2PS'. Intrinsic: '_mm512_cvtfxpnt_round_adjustepu32_ps'. Requires KNCNI.

func M512CvtfxpntRoundAdjustpsEpi32 ¶

func M512CvtfxpntRoundAdjustpsEpi32(v2 x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512i)

M512CvtfxpntRoundAdjustpsEpi32: Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in 'v2' to packed 32-bit integer elements and performs an optional exponent adjust using 'expadj', storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := Float32ToInt32(v2[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPS2DQ'. Intrinsic: '_mm512_cvtfxpnt_round_adjustps_epi32'. Requires KNCNI.

func M512CvtfxpntRoundAdjustpsEpu32 ¶

func M512CvtfxpntRoundAdjustpsEpu32(v2 x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512i)

M512CvtfxpntRoundAdjustpsEpu32: Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in 'v2' to packed 32-bit unsigned integer elements and performs an optional exponent adjust using 'expadj', storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := Float32ToUInt32(v2[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPS2UDQ'. Intrinsic: '_mm512_cvtfxpnt_round_adjustps_epu32'. Requires KNCNI.

func M512CvtfxpntRoundpdEpi32lo ¶

func M512CvtfxpntRoundpdEpi32lo(v2 x86.M512d, rounding int) (dst x86.M512i)

M512CvtfxpntRoundpdEpi32lo: Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector 'v2' to 32-bit integer elements, storing them in the lower half of 'dst'. The elements in the upper half of 'dst' are set to 0.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			k := j*32
			dst[k+31:k] := Float64ToInt32(v2[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPD2DQ'. Intrinsic: '_mm512_cvtfxpnt_roundpd_epi32lo'. Requires KNCNI.

func M512CvtfxpntRoundpdEpu32lo ¶

func M512CvtfxpntRoundpdEpu32lo(v2 x86.M512d, rounding int) (dst x86.M512i)

M512CvtfxpntRoundpdEpu32lo: Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to packed 32-bit unsigned integer elements, storing the results in 'dst'. Results are written to the lower half of 'dst', and the upper half locations are set to '0'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			k := j*32
			dst[k+31:k] := Float64ToUInt32(v2[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPD2UDQ'. Intrinsic: '_mm512_cvtfxpnt_roundpd_epu32lo'. Requires KNCNI.

func M512CvtpdPslo ¶

func M512CvtpdPslo(v2 x86.M512d) (dst x86.M512)

M512CvtpdPslo: Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to single-precision (32-bit) floating-point elements and stores them in 'dst'. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.

FOR j := 0 to 7
	i := j*64
	k := j*32
	dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvtpd_pslo'. Requires KNCNI.

func M512CvtpsloPd ¶

func M512CvtpsloPd(v2 x86.M512) (dst x86.M512d)

M512CvtpsloPd: Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	k := j*64
	dst[k+63:k] := Float32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvtpslo_pd'. Requires KNCNI.

func M512Exp223Ps ¶

func M512Exp223Ps(v2 x86.M512i) (dst x86.M512)

M512Exp223Ps: Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in 'v2' with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := exp223(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXP223PS'. Intrinsic: '_mm512_exp223_ps'. Requires KNCNI.

func M512FixupnanPd ¶

func M512FixupnanPd(v1 x86.M512d, v2 x86.M512d, v3 x86.M512i) (dst x86.M512d)

M512FixupnanPd: Fixes up NaN's from packed double-precision (64-bit) floating-point elements in 'v1' and 'v2', storing the results in 'dst' and storing the quietized NaN's from 'v1' in 'v3'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
	v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPNANPD'. Intrinsic: '_mm512_fixupnan_pd'. Requires KNCNI.

func M512FixupnanPs ¶

func M512FixupnanPs(v1 x86.M512, v2 x86.M512, v3 x86.M512i) (dst x86.M512)

M512FixupnanPs: Fixes up NaN's from packed single-precision (32-bit) floating-point elements in 'v1' and 'v2', storing the results in 'dst' and storing the quietized NaN's from 'v1' in 'v3'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
	v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPNANPS'. Intrinsic: '_mm512_fixupnan_ps'. Requires KNCNI.

func M512Fmadd233Epi32 ¶

func M512Fmadd233Epi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512Fmadd233Epi32: Multiply packed 32-bit integer elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	base := (j & ~0x3) * 32
	scale[31:0] := b[base+63:base+32]
	bias[31:0]  := b[base+31:base]
	dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADD233D'. Intrinsic: '_mm512_fmadd233_epi32'. Requires KNCNI.

func M512Fmadd233Ps ¶

func M512Fmadd233Ps(a x86.M512, b x86.M512) (dst x86.M512)

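M512Fmadd233Ps: Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	base := (j & ~0x3) * 32
	scale[31:0] := b[base+63:base+32]
	bias[31:0]  := b[base+31:base]
	dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ENDFOR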
dst[MAX:512] := 0

Instruction: 'VFMADD233PS'. Intrinsic: '_mm512_fmadd233_ps'. Requires KNCNI.

func M512Fmadd233RoundPs ¶

func M512Fmadd233RoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512Fmadd233RoundPs: Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			base := (j & ~0x3) * 32
			scale[31:0] := b[base+63:base+32]
			bias[31:0]  := b[base+31:base]
			dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD233PS'. Intrinsic: '_mm512_fmadd233_round_ps'. Requires KNCNI.

func M512FmaddEpi32 ¶

func M512FmaddEpi32(a x86.M512i, b x86.M512i, c x86.M512i) (dst x86.M512i)

M512FmaddEpi32: Multiply packed 32-bit integer elements in 'a' and 'b', add the intermediate result to packed elements in 'c' and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADD231D'. Intrinsic: '_mm512_fmadd_epi32'. Requires KNCNI.

func M512FmaddPd ¶

func M512FmaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_fmadd_pd'. Requires KNCNI.

func M512FmaddPs ¶

func M512FmaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_fmadd_ps'. Requires KNCNI.

func M512FmaddRoundPd ¶

func M512FmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_fmadd_round_pd'. Requires KNCNI.

func M512FmaddRoundPs ¶

func M512FmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_fmadd_round_ps'. Requires KNCNI.

func M512FmsubPd ¶

func M512FmsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_fmsub_pd'. Requires KNCNI.

func M512FmsubPs ¶

func M512FmsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_fmsub_ps'. Requires KNCNI.

func M512FmsubRoundPd ¶

func M512FmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_fmsub_round_pd'. Requires KNCNI.

func M512FmsubRoundPs ¶

func M512FmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_fmsub_round_ps'. Requires KNCNI.

func M512FnmaddPd ¶

func M512FnmaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_fnmadd_pd'. Requires KNCNI.

func M512FnmaddPs ¶

func M512FnmaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_fnmadd_ps'. Requires KNCNI.

func M512FnmaddRoundPd ¶

func M512FnmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_fnmadd_round_pd'. Requires KNCNI.

func M512FnmaddRoundPs ¶

func M512FnmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_fnmadd_round_ps'. Requires KNCNI.

func M512FnmsubPd ¶

func M512FnmsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_fnmsub_pd'. Requires KNCNI.

func M512FnmsubPs ¶

func M512FnmsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_fnmsub_ps'. Requires KNCNI.

func M512FnmsubRoundPd ¶

func M512FnmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_fnmsub_round_pd'. Requires KNCNI.

func M512FnmsubRoundPs ¶

func M512FnmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_fnmsub_round_ps'. Requires KNCNI.

func M512GetexpPd ¶

func M512GetexpPd(a x86.M512d) (dst x86.M512d)

M512GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_getexp_pd'. Requires KNCNI.

func M512GetexpPs ¶

func M512GetexpPs(a x86.M512) (dst x86.M512)

M512GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_getexp_ps'. Requires KNCNI.

func M512GetexpRoundPd ¶

func M512GetexpRoundPd(a x86.M512d, rounding int) (dst x86.M512d)

M512GetexpRoundPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ConvertExpFP64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_getexp_round_pd'. Requires KNCNI.

func M512GetexpRoundPs ¶

func M512GetexpRoundPs(a x86.M512, rounding int) (dst x86.M512)

M512GetexpRoundPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := ConvertExpFP32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_getexp_round_ps'. Requires KNCNI.

func M512GetmantPd ¶

func M512GetmantPd(a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)

M512GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_getmant_pd'. Requires KNCNI.

func M512GetmantPs ¶

func M512GetmantPs(a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)

M512GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_getmant_ps'. Requires KNCNI.

func M512GetmantRoundPd ¶

func M512GetmantRoundPd(a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)

M512GetmantRoundPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_getmant_round_pd'. Requires KNCNI.

func M512GetmantRoundPs ¶

func M512GetmantRoundPs(a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)

M512GetmantRoundPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_getmant_round_ps'. Requires KNCNI.

func M512GmaxPd ¶

func M512GmaxPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512GmaxPd: Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXPD'. Intrinsic: '_mm512_gmax_pd'. Requires KNCNI.

func M512GmaxPs ¶

func M512GmaxPs(a x86.M512, b x86.M512) (dst x86.M512)

M512GmaxPs: Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXPS'. Intrinsic: '_mm512_gmax_ps'. Requires KNCNI.

func M512GmaxabsPs ¶

func M512GmaxabsPs(a x86.M512, b x86.M512) (dst x86.M512)

M512GmaxabsPs: Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXABSPS'. Intrinsic: '_mm512_gmaxabs_ps'. Requires KNCNI.

func M512GminPd ¶

func M512GminPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512GminPd: Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMINPD'. Intrinsic: '_mm512_gmin_pd'. Requires KNCNI.

func M512GminPs ¶

func M512GminPs(a x86.M512, b x86.M512) (dst x86.M512)

M512GminPs: Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMINPS'. Intrinsic: '_mm512_gmin_ps'. Requires KNCNI.

func M512Int2mask ¶

func M512Int2mask(mask int) (dst x86.Mmask16)

M512Int2mask: Converts integer 'mask' into bitmask, storing the result in 'dst'.

dst := mask[15:0]

Instruction: 'KMOV'. Intrinsic: '_mm512_int2mask'. Requires KNCNI.

func M512Kand ¶

func M512Kand(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kand: Compute the bitwise AND of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] AND b[15:0]
k[MAX:16] := 0

Instruction: 'KAND'. Intrinsic: '_mm512_kand'. Requires KNCNI.

func M512Kandn ¶

func M512Kandn(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kandn: Compute the bitwise AND NOT of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := (NOT a[15:0]) AND b[15:0]
k[MAX:16] := 0

Instruction: 'KANDN'. Intrinsic: '_mm512_kandn'. Requires KNCNI.

func M512Kandnr ¶

func M512Kandnr(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)

M512Kandnr: Performs a bitwise AND operation between NOT of 'k2' and 'k1', storing the result in 'dst'.

dst[15:0] := NOT(k2[15:0]) & k1[15:0]

Instruction: 'KANDNR'. Intrinsic: '_mm512_kandnr'. Requires KNCNI.

func M512Kconcathi64 ¶

func M512Kconcathi64(k1 x86.Mmask16, k2 x86.Mmask16) int64

M512Kconcathi64: Packs masks 'k1' and 'k2' into the high 32 bits of 'dst'. The rest of 'dst' is set to 0.

dst[63:48] := k1[15:0]
dst[47:32] := k2[15:0]
dst[31:0]  := 0

Instruction: 'KCONCATH'. Intrinsic: '_mm512_kconcathi_64'. Requires KNCNI.

func M512Kconcatlo64 ¶

func M512Kconcatlo64(k1 x86.Mmask16, k2 x86.Mmask16) int64

M512Kconcatlo64: Packs masks 'k1' and 'k2' into the low 32 bits of 'dst'. The rest of 'dst' is set to 0.

dst[31:16] := k1[15:0]
dst[15:0]  := k2[15:0]
dst[63:32] := 0

Instruction: 'KCONCATL'. Intrinsic: '_mm512_kconcatlo_64'. Requires KNCNI.

func M512Kextract64 ¶

func M512Kextract64(a int64, b int) (dst x86.Mmask16)

M512Kextract64: Extracts the 16-bit field of 64-bit integer 'a' selected by index 'b' (0 selects the highest 16 bits, 3 the lowest), storing the result in 'dst'.

CASE b of
0: dst[15:0] := a[63:48]
1: dst[15:0] := a[47:32]
2: dst[15:0] := a[31:16]
3: dst[15:0] := a[15:0]
ESAC
dst[MAX:16] := 0

Instruction: 'KEXTRACT'. Intrinsic: '_mm512_kextract_64'. Requires KNCNI.

func M512Kmerge2l1h ¶

func M512Kmerge2l1h(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)

M512Kmerge2l1h: Move the high element from 'k1' to the low element of 'k1', and insert the low element of 'k2' into the high element of 'k1'.

tmp[7:0] := k1[15:8]
k1[15:8] := k2[7:0]
k1[7:0]  := tmp[7:0]

Instruction: 'KMERGE2L1H'. Intrinsic: '_mm512_kmerge2l1h'. Requires KNCNI.

func M512Kmerge2l1l ¶

func M512Kmerge2l1l(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)

M512Kmerge2l1l: Insert the low element of 'k2' into the high element of 'k1'.

k1[15:8] := k2[7:0]

Instruction: 'KMERGE2L1L'. Intrinsic: '_mm512_kmerge2l1l'. Requires KNCNI.

func M512Kmov ¶

func M512Kmov(a x86.Mmask16) (dst x86.Mmask16)

M512Kmov: Copy 16-bit mask 'a' to 'k'.

k[15:0] := a[15:0]
k[MAX:16] := 0

Instruction: 'KMOV'. Intrinsic: '_mm512_kmov'. Requires KNCNI.

func M512Kmovlhb ¶

func M512Kmovlhb(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)

M512Kmovlhb: Inserts the low byte of mask 'k2' into the high byte of 'dst', and copies the low byte of 'k1' to the low byte of 'dst'.

dst[7:0] := k1[7:0]
dst[15:8] := k2[7:0]

Instruction: 'KMERGE2L1L'. Intrinsic: '_mm512_kmovlhb'. Requires KNCNI.

func M512Knot ¶

func M512Knot(a x86.Mmask16) (dst x86.Mmask16)

M512Knot: Compute the bitwise NOT of 16-bit mask 'a', and store the result in 'k'.

k[15:0] := NOT a[15:0]
k[MAX:16] := 0

Instruction: 'KNOT'. Intrinsic: '_mm512_knot'. Requires KNCNI.

func M512Kor ¶

func M512Kor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kor: Compute the bitwise OR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] OR b[15:0]
k[MAX:16] := 0

Instruction: 'KOR'. Intrinsic: '_mm512_kor'. Requires KNCNI.

func M512Kortestc ¶

func M512Kortestc(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestc: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. CF flag is set if 'dst' consists of all 1's.

dst[15:0] := k1[15:0] | k2[15:0]
IF PopCount(dst[15:0]) = 16
	SetCF()
FI

Instruction: 'KORTEST'. Intrinsic: '_mm512_kortestc'. Requires KNCNI.

func M512Kortestz ¶

func M512Kortestz(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestz: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. ZF flag is set if 'dst' is 0.

dst[15:0] := k1[15:0] | k2[15:0]
IF dst = 0
	SetZF()
FI

Instruction: 'KORTEST'. Intrinsic: '_mm512_kortestz'. Requires KNCNI.

func M512Kswapb ¶

func M512Kswapb(k1 x86.Mmask16, k2 x86.Mmask16) (dst x86.Mmask16)

M512Kswapb: Moves high byte from 'k2' to low byte of 'k1', and moves low byte of 'k2' to high byte of 'k1'.

tmp[7:0] := k2[15:8]
k2[15:8] := k1[7:0]
k1[7:0]  := tmp[7:0]

tmp[7:0] := k2[7:0]
k2[7:0]  := k1[15:8]
k1[15:8] := tmp[7:0]

Instruction: 'KMERGE2L1H'. Intrinsic: '_mm512_kswapb'. Requires KNCNI.

func M512Kxnor ¶

func M512Kxnor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxnor: Compute the bitwise XNOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := NOT (a[15:0] XOR b[15:0])
k[MAX:16] := 0

Instruction: 'KXNOR'. Intrinsic: '_mm512_kxnor'. Requires KNCNI.

func M512Kxor ¶

func M512Kxor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxor: Compute the bitwise XOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] XOR b[15:0]
k[MAX:16] := 0

Instruction: 'KXOR'. Intrinsic: '_mm512_kxor'. Requires KNCNI.

func M512Log2Ps ¶

func M512Log2Ps(a x86.M512) (dst x86.M512)

M512Log2Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VLOG2PS'. Intrinsic: '_mm512_log2_ps'. Requires KNCNI.

func M512Log2ae23Ps ¶

func M512Log2ae23Ps(a x86.M512) (dst x86.M512)

M512Log2ae23Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a' with absolute error of 2^(-23) and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := Log2ae23(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VLOG2PS'. Intrinsic: '_mm512_log2ae23_ps'. Requires KNCNI.

func M512Mask2int ¶

func M512Mask2int(k1 x86.Mmask16) int

M512Mask2int: Converts bit mask 'k1' into an integer value, storing the results in 'dst'.

dst := SignExtend(k1)

Instruction: 'KMOV'. Intrinsic: '_mm512_mask2int'. Requires KNCNI.

func M512Mask3FmaddEpi32 ¶

func M512Mask3FmaddEpi32(a x86.M512i, b x86.M512i, c x86.M512i, k x86.Mmask16) (dst x86.M512i)

M512Mask3FmaddEpi32: Multiply packed 32-bit integer elements in 'a' and 'b', add the intermediate result to packed elements in 'c' and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADD231D'. Intrinsic: '_mm512_mask3_fmadd_epi32'. Requires KNCNI.

func M512Mask3FmaddPd ¶

func M512Mask3FmaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_mask3_fmadd_pd'. Requires KNCNI.

func M512Mask3FmaddPs ¶

func M512Mask3FmaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_mask3_fmadd_ps'. Requires KNCNI.

func M512Mask3FmaddRoundPd ¶

func M512Mask3FmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_mask3_fmadd_round_pd'. Requires KNCNI.

func M512Mask3FmaddRoundPs ¶

func M512Mask3FmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_mask3_fmadd_round_ps'. Requires KNCNI.

func M512Mask3FmsubPd ¶

func M512Mask3FmsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_mask3_fmsub_pd'. Requires KNCNI.

func M512Mask3FmsubPs ¶

func M512Mask3FmsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_mask3_fmsub_ps'. Requires KNCNI.

func M512Mask3FmsubRoundPd ¶

func M512Mask3FmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_mask3_fmsub_round_pd'. Requires KNCNI.

func M512Mask3FmsubRoundPs ¶

func M512Mask3FmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_mask3_fmsub_round_ps'. Requires KNCNI.

func M512Mask3FnmaddPd ¶

func M512Mask3FnmaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_mask3_fnmadd_pd'. Requires KNCNI.

func M512Mask3FnmaddPs ¶

func M512Mask3FnmaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_mask3_fnmadd_ps'. Requires KNCNI.

func M512Mask3FnmaddRoundPd ¶

func M512Mask3FnmaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_mask3_fnmadd_round_pd'. Requires KNCNI.

func M512Mask3FnmaddRoundPs ¶

func M512Mask3FnmaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_mask3_fnmadd_round_ps'. Requires KNCNI.

func M512Mask3FnmsubPd ¶

func M512Mask3FnmsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_mask3_fnmsub_pd'. Requires KNCNI.

func M512Mask3FnmsubPs ¶

func M512Mask3FnmsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_mask3_fnmsub_ps'. Requires KNCNI.

func M512Mask3FnmsubRoundPd ¶

func M512Mask3FnmsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_mask3_fnmsub_round_pd'. Requires KNCNI.

func M512Mask3FnmsubRoundPs ¶

func M512Mask3FnmsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_mask3_fnmsub_round_ps'. Requires KNCNI.

func M512MaskAbsPd ¶

func M512MaskAbsPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d) (dst x86.M512d)

M512MaskAbsPd: Finds the absolute value of each packed double-precision (64-bit) floating-point element in 'v2', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(v2[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_mask_abs_pd'. Requires KNCNI.

func M512MaskAbsPs ¶

func M512MaskAbsPs(src x86.M512, k x86.Mmask16, v2 x86.M512) (dst x86.M512)

M512MaskAbsPs: Finds the absolute value of each packed single-precision (32-bit) floating-point element in 'v2', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(v2[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_mask_abs_ps'. Requires KNCNI.

func M512MaskAdcEpi32 ¶

func M512MaskAdcEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)

M512MaskAdcEpi32: Performs element-by-element addition of packed 32-bit integers in 'v2' and 'v3' and the corresponding bit in 'k2', storing the result of the addition in 'dst' and the result of the carry in 'k2_res' using writemask 'k1' (elements are copied from 'v2' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k2_res[j]   := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
		dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
	ELSE
		dst[i+31:i] := v2[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADCD'. Intrinsic: '_mm512_mask_adc_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskAddEpi32 ¶

func M512MaskAddEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm512_mask_add_epi32'. Requires KNCNI.

func M512MaskAddPd ¶

func M512MaskAddPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAddPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_mask_add_pd'. Requires KNCNI.

func M512MaskAddPs ¶

func M512MaskAddPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_mask_add_ps'. Requires KNCNI.

func M512MaskAddRoundPd ¶

func M512MaskAddRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskAddRoundPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] + b[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_mask_add_round_pd'. Requires KNCNI.

func M512MaskAddRoundPs ¶

func M512MaskAddRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskAddRoundPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] + b[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_mask_add_round_ps'. Requires KNCNI.

func M512MaskAddnPd ¶

func M512MaskAddnPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)

M512MaskAddnPd: Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in 'v2' and 'v3' and negates their sum, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDNPD'. Intrinsic: '_mm512_mask_addn_pd'. Requires KNCNI.

func M512MaskAddnPs ¶

func M512MaskAddnPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512) (dst x86.M512)

M512MaskAddnPs: Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in 'v2' and 'v3' and negates their sum, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDNPS'. Intrinsic: '_mm512_mask_addn_ps'. Requires KNCNI.

func M512MaskAddnRoundPd ¶

func M512MaskAddnRoundPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)

M512MaskAddnRoundPd: Performs element by element addition between packed double-precision (64-bit) floating-point elements in 'v2' and 'v3' and negates the sum, storing the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDNPD'. Intrinsic: '_mm512_mask_addn_round_pd'. Requires KNCNI.

func M512MaskAddnRoundPs ¶

func M512MaskAddnRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)

M512MaskAddnRoundPs: Performs element by element addition between packed single-precision (32-bit) floating-point elements in 'v2' and 'v3' and negates the sum, storing the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDNPS'. Intrinsic: '_mm512_mask_addn_round_ps'. Requires KNCNI.

func M512MaskAddsetcEpi32 ¶

func M512MaskAddsetcEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, k2_res *x86.Mmask16) (dst x86.M512i)

M512MaskAddsetcEpi32: Performs element-by-element addition of packed 32-bit integer elements in 'v2' and 'v3', storing the resultant carry in 'k2_res' (carry flag) and the addition results in 'dst' using writemask 'k' (elements are copied from 'v2' and 'k_old' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		k2_res[j]   := Carry(v2[i+31:i] + v3[i+31:i])
		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
	ELSE
		dst[i+31:i] := v2[i+31:i]
		k2_res[j] := k_old[j]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSETCD'. Intrinsic: '_mm512_mask_addsetc_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskAddsetsEpi32 ¶

func M512MaskAddsetsEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i, sign *x86.Mmask16) (dst x86.M512i)

M512MaskAddsetsEpi32: Performs an element-by-element addition of packed 32-bit integer elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag). Results are stored using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
		sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSETSD'. Intrinsic: '_mm512_mask_addsets_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskAddsetsPs ¶

func M512MaskAddsetsPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, sign *x86.Mmask16) (dst x86.M512)

M512MaskAddsetsPs: Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag). Results are stored using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
		sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDSETSPS'. Intrinsic: '_mm512_mask_addsets_ps'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskAddsetsRoundPs ¶

func M512MaskAddsetsRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, sign *x86.Mmask16, rounding int) (dst x86.M512)

M512MaskAddsetsRoundPs: Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in 'v2' and 'v3', storing the results in 'dst' and the sign of the sum in 'sign' (sign flag). Results are stored using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
				sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDSETSPS'. Intrinsic: '_mm512_mask_addsets_round_ps'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskAlignrEpi32 ¶

func M512MaskAlignrEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskAlignrEpi32: Concatenate 'a' and 'b' into a 128-byte intermediate result, shift the result right by 'count' 32-bit elements, and store the low 64 bytes (16 elements) in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := temp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGND'. Intrinsic: '_mm512_mask_alignr_epi32'. Requires KNCNI.

func M512MaskAndEpi32 ¶

func M512MaskAndEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)

M512MaskAndEpi32: Performs element-by-element bitwise AND between packed 32-bit integer elements of 'v2' and 'v3', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v2[i+31:i] & v3[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_mask_and_epi32'. Requires KNCNI.

func M512MaskAndEpi64 ¶

func M512MaskAndEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_mask_and_epi64'. Requires KNCNI.

func M512MaskAndnotEpi32 ¶

func M512MaskAndnotEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm512_mask_andnot_epi32'. Requires KNCNI.

func M512MaskAndnotEpi64 ¶

func M512MaskAndnotEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm512_mask_andnot_epi64'. Requires KNCNI.

func M512MaskBlendEpi32 ¶

func M512MaskBlendEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBLENDMD'. Intrinsic: '_mm512_mask_blend_epi32'. Requires KNCNI.

func M512MaskBlendEpi64 ¶

func M512MaskBlendEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBLENDMQ'. Intrinsic: '_mm512_mask_blend_epi64'. Requires KNCNI.

func M512MaskBlendPd ¶

func M512MaskBlendPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBLENDMPD'. Intrinsic: '_mm512_mask_blend_pd'. Requires KNCNI.

func M512MaskBlendPs ¶

func M512MaskBlendPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBLENDMPS'. Intrinsic: '_mm512_mask_blend_ps'. Requires KNCNI.

func M512MaskCmpEpi32Mask ¶

func M512MaskCmpEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)

M512MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmp_epi32_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpEpu32Mask ¶

func M512MaskCmpEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask16)

M512MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmp_epu32_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpPdMask ¶

func M512MaskCmpPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.Mmask8)

M512MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmp_pd_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpPsMask ¶

func M512MaskCmpPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.Mmask16)

M512MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmp_ps_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpRoundPdMask ¶

func M512MaskCmpRoundPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, sae int) (dst x86.Mmask8)

M512MaskCmpRoundPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC
	FOR j := 0 to 7
		i := j*64
		IF k1[j]
			k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
		ELSE
			k[j] := 0
		FI
	ENDFOR
	k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmp_round_pd_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpRoundPsMask ¶

func M512MaskCmpRoundPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, sae int) (dst x86.Mmask16)

M512MaskCmpRoundPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC
	FOR j := 0 to 15
		i := j*32
		IF k1[j]
			k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
		ELSE
			k[j] := 0
		FI
	ENDFOR
	k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmp_round_ps_mask'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpeqEpi32Mask ¶

func M512MaskCmpeqEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPEQD'. Intrinsic: '_mm512_mask_cmpeq_epi32_mask'. Requires KNCNI.

func M512MaskCmpeqEpu32Mask ¶

func M512MaskCmpeqEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmpeq_epu32_mask'. Requires KNCNI.

func M512MaskCmpeqPdMask ¶

func M512MaskCmpeqPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpeqPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpeq_pd_mask'. Requires KNCNI.

func M512MaskCmpeqPsMask ¶

func M512MaskCmpeqPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpeqPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpeq_ps_mask'. Requires KNCNI.

func M512MaskCmpgeEpi32Mask ¶

func M512MaskCmpgeEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmpge_epi32_mask'. Requires KNCNI.

func M512MaskCmpgeEpu32Mask ¶

func M512MaskCmpgeEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmpge_epu32_mask'. Requires KNCNI.

func M512MaskCmpgtEpi32Mask ¶

func M512MaskCmpgtEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPGTD'. Intrinsic: '_mm512_mask_cmpgt_epi32_mask'. Requires KNCNI.

func M512MaskCmpgtEpu32Mask ¶

func M512MaskCmpgtEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmpgt_epu32_mask'. Requires KNCNI.

func M512MaskCmpleEpi32Mask ¶

func M512MaskCmpleEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmple_epi32_mask'. Requires KNCNI.

func M512MaskCmpleEpu32Mask ¶

func M512MaskCmpleEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmple_epu32_mask'. Requires KNCNI.

func M512MaskCmplePdMask ¶

func M512MaskCmplePdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmplePdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmple_pd_mask'. Requires KNCNI.

func M512MaskCmplePsMask ¶

func M512MaskCmplePsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmplePsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmple_ps_mask'. Requires KNCNI.

func M512MaskCmpltEpi32Mask ¶

func M512MaskCmpltEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPLTD'. Intrinsic: '_mm512_mask_cmplt_epi32_mask'. Requires KNCNI.

func M512MaskCmpltEpu32Mask ¶

func M512MaskCmpltEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmplt_epu32_mask'. Requires KNCNI.

func M512MaskCmpltPdMask ¶

func M512MaskCmpltPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpltPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmplt_pd_mask'. Requires KNCNI.

func M512MaskCmpltPsMask ¶

func M512MaskCmpltPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpltPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmplt_ps_mask'. Requires KNCNI.

func M512MaskCmpneqEpi32Mask ¶

func M512MaskCmpneqEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmpneq_epi32_mask'. Requires KNCNI.

func M512MaskCmpneqEpu32Mask ¶

func M512MaskCmpneqEpu32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm512_mask_cmpneq_epu32_mask'. Requires KNCNI.

func M512MaskCmpneqPdMask ¶

func M512MaskCmpneqPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpneqPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpneq_pd_mask'. Requires KNCNI.

func M512MaskCmpneqPsMask ¶

func M512MaskCmpneqPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpneqPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpneq_ps_mask'. Requires KNCNI.

func M512MaskCmpnlePdMask ¶

func M512MaskCmpnlePdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpnlePdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpnle_pd_mask'. Requires KNCNI.

func M512MaskCmpnlePsMask ¶

func M512MaskCmpnlePsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpnlePsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpnle_ps_mask'. Requires KNCNI.

func M512MaskCmpnltPdMask ¶

func M512MaskCmpnltPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpnltPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpnlt_pd_mask'. Requires KNCNI.

func M512MaskCmpnltPsMask ¶

func M512MaskCmpnltPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpnltPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpnlt_ps_mask'. Requires KNCNI.

func M512MaskCmpordPdMask ¶

func M512MaskCmpordPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpordPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpord_pd_mask'. Requires KNCNI.

func M512MaskCmpordPsMask ¶

func M512MaskCmpordPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpordPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpord_ps_mask'. Requires KNCNI.

func M512MaskCmpunordPdMask ¶

func M512MaskCmpunordPdMask(k1 x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.Mmask8)

M512MaskCmpunordPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm512_mask_cmpunord_pd_mask'. Requires KNCNI.

func M512MaskCmpunordPsMask ¶

func M512MaskCmpunordPsMask(k1 x86.Mmask16, a x86.M512, b x86.M512) (dst x86.Mmask16)

M512MaskCmpunordPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm512_mask_cmpunord_ps_mask'. Requires KNCNI.

func M512MaskCvtRoundpdPslo ¶

func M512MaskCvtRoundpdPslo(src x86.M512, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512)

M512MaskCvtRoundpdPslo: Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to packed single-precision (32-bit) floating-point elements, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Results are written to the lower half of 'dst', and the upper half locations are set to '0'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvt_roundpd_pslo'. Requires KNCNI.

func M512MaskCvtepi32loPd ¶

func M512MaskCvtepi32loPd(src x86.M512d, k x86.Mmask8, v2 x86.M512i) (dst x86.M512d)

M512MaskCvtepi32loPd: Performs element-by-element conversion of the lower half of packed 32-bit integer elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := j*64
	IF k[j]
		dst[n+63:n] := Int32ToFloat64(v2[i+31:i])
	ELSE
		dst[n+63:n] := src[n+63:n]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_mask_cvtepi32lo_pd'. Requires KNCNI.

func M512MaskCvtepu32loPd ¶

func M512MaskCvtepu32loPd(src x86.M512d, k x86.Mmask8, v2 x86.M512i) (dst x86.M512d)

M512MaskCvtepu32loPd: Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[l+63:l] := UInt32ToFloat64(v2[i+31:i])
	ELSE
		dst[l+63:l] := src[l+63:l]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_mask_cvtepu32lo_pd'. Requires KNCNI.

func M512MaskCvtfxpntRoundAdjustepu32Ps ¶

func M512MaskCvtfxpntRoundAdjustepu32Ps(src x86.M512, k x86.Mmask16, v2 x86.M512i, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512MaskCvtfxpntRoundAdjustepu32Ps: Performs element-by-element conversion of packed 32-bit unsigned integer elements in 'v2' to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using 'expadj', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
				CASE expadj OF
				_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
				_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
				_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
				_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
				_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
				_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
				_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
				_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
				ESAC
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTUDQ2PS'. Intrinsic: '_mm512_mask_cvtfxpnt_round_adjustepu32_ps'. Requires KNCNI.

func M512MaskCvtfxpntRoundpdEpi32lo ¶

func M512MaskCvtfxpntRoundpdEpi32lo(src x86.M512i, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtfxpntRoundpdEpi32lo: Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector 'v2' to 32-bit integer elements, storing them in the lower half of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The elements in the upper half of 'dst' are set to 0.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Float64ToInt32(v2[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPD2DQ'. Intrinsic: '_mm512_mask_cvtfxpnt_roundpd_epi32lo'. Requires KNCNI.

func M512MaskCvtfxpntRoundpdEpu32lo ¶

func M512MaskCvtfxpntRoundpdEpu32lo(src x86.M512i, k x86.Mmask8, v2 x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtfxpntRoundpdEpu32lo: Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to packed 32-bit unsigned integer elements, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Results are written to the lower half of 'dst', and the upper half locations are set to '0'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Float64ToUInt32(v2[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTFXPNTPD2UDQ'. Intrinsic: '_mm512_mask_cvtfxpnt_roundpd_epu32lo'. Requires KNCNI.

func M512MaskCvtpdPslo ¶

func M512MaskCvtpdPslo(src x86.M512, k x86.Mmask8, v2 x86.M512d) (dst x86.M512)

M512MaskCvtpdPslo: Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in 'v2' to single-precision (32-bit) floating-point elements and stores them in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvtpd_pslo'. Requires KNCNI.

func M512MaskCvtpsloPd ¶

func M512MaskCvtpsloPd(src x86.M512d, k x86.Mmask8, v2 x86.M512) (dst x86.M512d)

M512MaskCvtpsloPd: Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in 'v2' to packed double-precision (64-bit) floating-point elements, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[l+63:l] := Float32ToFloat64(v2[i+31:i])
	ELSE
		dst[l+63:l] := src[l+63:l]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvtpslo_pd'. Requires KNCNI.

func M512MaskExp223Ps ¶

func M512MaskExp223Ps(src x86.M512, k x86.Mmask16, v2 x86.M512i) (dst x86.M512)

M512MaskExp223Ps: Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in 'v2' with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := exp223(v2[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXP223PS'. Intrinsic: '_mm512_mask_exp223_ps'. Requires KNCNI.

func M512MaskFixupnanPd ¶

func M512MaskFixupnanPd(v1 x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512i) (dst x86.M512d)

M512MaskFixupnanPd: Fixes up NaN's from packed double-precision (64-bit) floating-point elements in 'v1' and 'v2', storing the results in 'dst' using writemask 'k' (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from 'v1' are stored in 'v3'.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
		v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPNANPD'. Intrinsic: '_mm512_mask_fixupnan_pd'. Requires KNCNI.

func M512MaskFixupnanPs ¶

func M512MaskFixupnanPs(v1 x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512i) (dst x86.M512)

M512MaskFixupnanPs: Fixes up NaN's from packed single-precision (32-bit) floating-point elements in 'v1' and 'v2', storing the results in 'dst' using writemask 'k' (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from 'v1' are stored in 'v3'.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
		v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPNANPS'. Intrinsic: '_mm512_mask_fixupnan_ps'. Requires KNCNI.

func M512MaskFmadd233Epi32 ¶

func M512MaskFmadd233Epi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskFmadd233Epi32: Multiply packed 32-bit integer elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		base := (j & ~0x3) * 32
		scale[31:0] := b[base+63:base+32]
		bias[31:0]  := b[base+31:base]
		dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADD233D'. Intrinsic: '_mm512_mask_fmadd233_epi32'. Requires KNCNI.

func M512MaskFmadd233Ps ¶

func M512MaskFmadd233Ps(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

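M512MaskFmadd233Ps: Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		base := (j & ~0x3) * 32
		scale[31:0] := b[base+63:base+32]
		bias[31:0]  := b[base+31:base]
		dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]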
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD233PS'. Intrinsic: '_mm512_mask_fmadd233_ps'. Requires KNCNI.

func M512MaskFmadd233RoundPs ¶

func M512MaskFmadd233RoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskFmadd233RoundPs: Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of 'a' by element 1 of the corresponding 4-element set from 'b', add the intermediate result to element 0 of the corresponding 4-element set from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				base := (j & ~0x3) * 32
				scale[31:0] := b[base+63:base+32]
				bias[31:0]  := b[base+31:base]
				dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD233PS'. Intrinsic: '_mm512_mask_fmadd233_round_ps'. Requires KNCNI.

func M512MaskFmaddEpi32 ¶

func M512MaskFmaddEpi32(a x86.M512i, k x86.Mmask16, b x86.M512i, c x86.M512i) (dst x86.M512i)

M512MaskFmaddEpi32: Multiply packed 32-bit integer elements in 'a' and 'b', add the intermediate result to packed elements in 'c' and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADD231D'. Intrinsic: '_mm512_mask_fmadd_epi32'. Requires KNCNI.

func M512MaskFmaddPd ¶

func M512MaskFmaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_mask_fmadd_pd'. Requires KNCNI.

func M512MaskFmaddPs ¶

func M512MaskFmaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_mask_fmadd_ps'. Requires KNCNI.

func M512MaskFmaddRoundPd ¶

func M512MaskFmaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_mask_fmadd_round_pd'. Requires KNCNI.

func M512MaskFmaddRoundPs ¶

func M512MaskFmaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_mask_fmadd_round_ps'. Requires KNCNI.

func M512MaskFmsubPd ¶

func M512MaskFmsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_mask_fmsub_pd'. Requires KNCNI.

func M512MaskFmsubPs ¶

func M512MaskFmsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_mask_fmsub_ps'. Requires KNCNI.

func M512MaskFmsubRoundPd ¶

func M512MaskFmsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_mask_fmsub_round_pd'. Requires KNCNI.

func M512MaskFmsubRoundPs ¶

func M512MaskFmsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_mask_fmsub_round_ps'. Requires KNCNI.

func M512MaskFnmaddPd ¶

func M512MaskFnmaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_mask_fnmadd_pd'. Requires KNCNI.

func M512MaskFnmaddPs ¶

func M512MaskFnmaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_mask_fnmadd_ps'. Requires KNCNI.

func M512MaskFnmaddRoundPd ¶

func M512MaskFnmaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_mask_fnmadd_round_pd'. Requires KNCNI.

func M512MaskFnmaddRoundPs ¶

func M512MaskFnmaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_mask_fnmadd_round_ps'. Requires KNCNI.

func M512MaskFnmsubPd ¶

func M512MaskFnmsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_mask_fnmsub_pd'. Requires KNCNI.

func M512MaskFnmsubPs ¶

func M512MaskFnmsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_mask_fnmsub_ps'. Requires KNCNI.

func M512MaskFnmsubRoundPd ¶

func M512MaskFnmsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_mask_fnmsub_round_pd'. Requires KNCNI.

func M512MaskFnmsubRoundPs ¶

func M512MaskFnmsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_mask_fnmsub_round_ps'. Requires KNCNI.

func M512MaskGetexpPd ¶

func M512MaskGetexpPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_mask_getexp_pd'. Requires KNCNI.

func M512MaskGetexpPs ¶

func M512MaskGetexpPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_mask_getexp_ps'. Requires KNCNI.

func M512MaskGetexpRoundPd ¶

func M512MaskGetexpRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskGetexpRoundPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertExpFP64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_mask_getexp_round_pd'. Requires KNCNI.

func M512MaskGetexpRoundPs ¶

func M512MaskGetexpRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskGetexpRoundPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ConvertExpFP32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_mask_getexp_round_ps'. Requires KNCNI.

func M512MaskGetmantPd ¶

func M512MaskGetmantPd(src x86.M512d, k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)

M512MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_mask_getmant_pd'. Requires KNCNI.

func M512MaskGetmantPs ¶

func M512MaskGetmantPs(src x86.M512, k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)

M512MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_mask_getmant_ps'. Requires KNCNI.

func M512MaskGetmantRoundPd ¶

func M512MaskGetmantRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)

M512MaskGetmantRoundPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_mask_getmant_round_pd'. Requires KNCNI.

func M512MaskGetmantRoundPs ¶

func M512MaskGetmantRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)

M512MaskGetmantRoundPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_mask_getmant_round_ps'. Requires KNCNI.

func M512MaskGmaxPd ¶

func M512MaskGmaxPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskGmaxPd: Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXPD'. Intrinsic: '_mm512_mask_gmax_pd'. Requires KNCNI.

func M512MaskGmaxPs ¶

func M512MaskGmaxPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskGmaxPs: Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXPS'. Intrinsic: '_mm512_mask_gmax_ps'. Requires KNCNI.

func M512MaskGmaxabsPs ¶

func M512MaskGmaxabsPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskGmaxabsPs: Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXABSPS'. Intrinsic: '_mm512_mask_gmaxabs_ps'. Requires KNCNI.

func M512MaskGminPd ¶

func M512MaskGminPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskGminPd: Determines the minimum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMINPD'. Intrinsic: '_mm512_mask_gmin_pd'. Requires KNCNI.

func M512MaskGminPs ¶

func M512MaskGminPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskGminPs: Determines the minimum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMINPS'. Intrinsic: '_mm512_mask_gmin_ps'. Requires KNCNI.

func M512MaskLog2Ps ¶

func M512MaskLog2Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog2Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := log2(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VLOG2PS'. Intrinsic: '_mm512_mask_log2_ps'. Requires KNCNI.

func M512MaskLog2ae23Ps ¶

func M512MaskLog2ae23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog2ae23Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a' with absolute error of 2^(-23) and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Log2ae23(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VLOG2PS'. Intrinsic: '_mm512_mask_log2ae23_ps'. Requires KNCNI.

func M512MaskMaxEpi32 ¶

func M512MaskMaxEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm512_mask_max_epi32'. Requires KNCNI.

func M512MaskMaxEpu32 ¶

func M512MaskMaxEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm512_mask_max_epu32'. Requires KNCNI.

func M512MaskMaxabsPs ¶

func M512MaskMaxabsPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMaxabsPs: Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXABSPS'. Intrinsic: '_mm512_mask_maxabs_ps'. Requires KNCNI.

func M512MaskMinEpi32 ¶

func M512MaskMinEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm512_mask_min_epi32'. Requires KNCNI.

func M512MaskMinEpu32 ¶

func M512MaskMinEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm512_mask_min_epu32'. Requires KNCNI.

func M512MaskMovEpi32 ¶

func M512MaskMovEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm512_mask_mov_epi32'. Requires KNCNI.

func M512MaskMovEpi64 ¶

func M512MaskMovEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm512_mask_mov_epi64'. Requires KNCNI.

func M512MaskMovPd ¶

func M512MaskMovPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm512_mask_mov_pd'. Requires KNCNI.

func M512MaskMovPs ¶

func M512MaskMovPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm512_mask_mov_ps'. Requires KNCNI.

func M512MaskMulPd ¶

func M512MaskMulPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). RM.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_mask_mul_pd'. Requires KNCNI.

func M512MaskMulPs ¶

func M512MaskMulPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). RM.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_mask_mul_ps'. Requires KNCNI.

func M512MaskMulRoundPd ¶

func M512MaskMulRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskMulRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] * b[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_mask_mul_round_pd'. Requires KNCNI.

func M512MaskMulRoundPs ¶

func M512MaskMulRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskMulRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] * b[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_mask_mul_round_ps'. Requires KNCNI.

func M512MaskMulhiEpi32 ¶

func M512MaskMulhiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulhiEpi32: Performs element-by-element multiplication between packed 32-bit integer elements in 'a' and 'b' and stores the high 32 bits of each result into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHD'. Intrinsic: '_mm512_mask_mulhi_epi32'. Requires KNCNI.

func M512MaskMulhiEpu32 ¶

func M512MaskMulhiEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulhiEpu32: Performs element-by-element multiplication between packed unsigned 32-bit integer elements in 'a' and 'b' and stores the high 32 bits of each result into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHUD'. Intrinsic: '_mm512_mask_mulhi_epu32'. Requires KNCNI.

func M512MaskMulloEpi32 ¶

func M512MaskMulloEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm512_mask_mullo_epi32'. Requires KNCNI.

func M512MaskOrEpi32 ¶

func M512MaskOrEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORD'. Intrinsic: '_mm512_mask_or_epi32'. Requires KNCNI.

func M512MaskOrEpi64 ¶

func M512MaskOrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm512_mask_or_epi64'. Requires KNCNI.

func M512MaskPermute4f128Epi32 ¶

func M512MaskPermute4f128Epi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskPermute4f128Epi32: Permutes 128-bit blocks of the packed 32-bit integer vector 'a' using constant 'imm8'. The results are stored in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control) {
	CASE control[1:0] OF
	0: tmp[127:0] := src[127:0]
	1: tmp[127:0] := src[255:128]
	2: tmp[127:0] := src[383:256]
	3: tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp[511:0] := 0
FOR j := 0 to 3
	i := j*128
	n := j*2
	tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMF32X4'. Intrinsic: '_mm512_mask_permute4f128_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskPermute4f128Ps ¶

func M512MaskPermute4f128Ps(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskPermute4f128Ps: Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in 'a' using constant 'imm8'. The results are stored in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control) {
	CASE control[1:0] OF
	0: tmp[127:0] := src[127:0]
	1: tmp[127:0] := src[255:128]
	2: tmp[127:0] := src[383:256]
	3: tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp[511:0] := 0
FOR j := 0 to 3
	i := j*128
	n := j*2
	tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMF32X4'. Intrinsic: '_mm512_mask_permute4f128_ps'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutevarEpi32 ¶

func M512MaskPermutevarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutevarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the 'permutevar' name. This intrinsic is identical to '_mm512_mask_permutexvar_epi32', and it is recommended that you use that intrinsic name.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_mask_permutevar_epi32'. Requires KNCNI.

func M512MaskRcp23Ps ¶

func M512MaskRcp23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRcp23Ps: Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in 'a' to 23 bits of precision, storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP23PS'. Intrinsic: '_mm512_mask_rcp23_ps'. Requires KNCNI.

func M512MaskReduceAddEpi32 ¶

func M512MaskReduceAddEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceAddEpi32: Reduce the packed 32-bit integers in 'a' by addition using mask 'k'. Returns the sum of all active elements in 'a'.

sum[31:0] := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		sum[31:0] := sum[31:0] + a[i+31:i]
	FI
ENDFOR
RETURN sum[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_add_epi32'. Requires KNCNI.

func M512MaskReduceAddEpi64 ¶

func M512MaskReduceAddEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceAddEpi64: Reduce the packed 64-bit integers in 'a' by addition using mask 'k'. Returns the sum of all active elements in 'a'.

sum[63:0] := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		sum[63:0] := sum[63:0] + a[i+63:i]
	FI
ENDFOR
RETURN sum[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_add_epi64'. Requires KNCNI.

func M512MaskReduceAddPd ¶

func M512MaskReduceAddPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceAddPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by addition using mask 'k'. Returns the sum of all active elements in 'a'.

sum[63:0] := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		sum[63:0] := sum[63:0] + a[i+63:i]
	FI
ENDFOR
RETURN sum[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_add_pd'. Requires KNCNI.

func M512MaskReduceAddPs ¶

func M512MaskReduceAddPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceAddPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by addition using mask 'k'. Returns the sum of all active elements in 'a'.

sum[31:0] := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		sum[31:0] := sum[31:0] + a[i+31:i]
	FI
ENDFOR
RETURN sum[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_add_ps'. Requires KNCNI.

func M512MaskReduceAndEpi32 ¶

func M512MaskReduceAndEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceAndEpi32: Reduce the packed 32-bit integers in 'a' by bitwise AND using mask 'k'. Returns the bitwise AND of all active elements in 'a'.

reduced[31:0] := 0xFFFFFFFF
FOR j := 0 to 15
	i := j*32
	IF k[j]
		reduced[31:0] := reduced[31:0] AND a[i+31:i]
	FI
ENDFOR
RETURN reduced[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_and_epi32'. Requires KNCNI.

func M512MaskReduceAndEpi64 ¶

func M512MaskReduceAndEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceAndEpi64: Reduce the packed 64-bit integers in 'a' by bitwise AND using mask 'k'. Returns the bitwise AND of all active elements in 'a'.

reduced[63:0] := 0xFFFFFFFFFFFFFFFF
FOR j := 0 to 7
	i := j*64
	IF k[j]
		reduced[63:0] := reduced[63:0] AND a[i+63:i]
	FI
ENDFOR
RETURN reduced[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_and_epi64'. Requires KNCNI.

func M512MaskReduceGmaxPd ¶

func M512MaskReduceGmaxPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceGmaxPd: Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in 'a' and stores the result in 'dst'. Bitmask 'k' is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

max = a[63:0]
FOR j := 1 to 7
	i := j*64
	IF k[j]
		max = FpMax(max, a[i+63:i])
	FI
ENDFOR
dst := max

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_gmax_pd'. Requires KNCNI.

func M512MaskReduceGmaxPs ¶

func M512MaskReduceGmaxPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceGmaxPs: Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in 'a' and stores the result in 'dst'. Bitmask 'k' is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

max = a[31:0]
FOR j := 1 to 15
	i := j*32
	IF k[j]
		max = FpMax(max, a[i+31:i])
	FI
ENDFOR
dst := max

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_gmax_ps'. Requires KNCNI.

func M512MaskReduceGminPd ¶

func M512MaskReduceGminPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceGminPd: Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in 'a' and stores the result in 'dst'. Bitmask 'k' is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

min = a[63:0]
FOR j := 1 to 7
	i := j*64
	IF k[j]
		min = FpMin(min, a[i+63:i])
	FI
ENDFOR
dst := min

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_gmin_pd'. Requires KNCNI.

func M512MaskReduceGminPs ¶

func M512MaskReduceGminPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceGminPs: Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in 'a' and stores the result in 'dst'. Bitmask 'k' is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).

min = a[31:0]
FOR j := 1 to 15
	i := j*32
	IF k[j]
		min = FpMin(min, a[i+31:i])
	FI
ENDFOR
dst := min

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_gmin_ps'. Requires KNCNI.

func M512MaskReduceMaxEpi32 ¶

func M512MaskReduceMaxEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceMaxEpi32: Reduce the packed 32-bit integers in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[31:0] := MIN_INT
FOR j := 0 to 15
	i := j*32
	IF k[j]
		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
	FI
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_epi32'. Requires KNCNI.

func M512MaskReduceMaxEpi64 ¶

func M512MaskReduceMaxEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceMaxEpi64: Reduce the packed 64-bit integers in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[63:0] := MIN_INT
FOR j := 0 to 7
	i := j*64
	IF k[j]
		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
	FI
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_epi64'. Requires KNCNI.

func M512MaskReduceMaxEpu32 ¶

func M512MaskReduceMaxEpu32(k x86.Mmask16, a x86.M512i) uint32

M512MaskReduceMaxEpu32: Reduce the packed unsigned 32-bit integers in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[31:0] := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
	FI
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_epu32'. Requires KNCNI.

func M512MaskReduceMaxEpu64 ¶

func M512MaskReduceMaxEpu64(k x86.Mmask8, a x86.M512i) uint64

M512MaskReduceMaxEpu64: Reduce the packed unsigned 64-bit integers in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[63:0] := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
	FI
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_epu64'. Requires KNCNI.

func M512MaskReduceMaxPd ¶

func M512MaskReduceMaxPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceMaxPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[63:0] := MIN_DOUBLE
FOR j := 0 to 7
	i := j*64
	IF k[j]
		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
	FI
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_pd'. Requires KNCNI.

func M512MaskReduceMaxPs ¶

func M512MaskReduceMaxPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceMaxPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by maximum using mask 'k'. Returns the maximum of all active elements in 'a'.

max[31:0] := MIN_FLOAT
FOR j := 0 to 15
	i := j*32
	IF k[j]
		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
	FI
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_max_ps'. Requires KNCNI.

func M512MaskReduceMinEpi32 ¶

func M512MaskReduceMinEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceMinEpi32: Reduce the packed 32-bit integers in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[31:0] := MAX_INT
FOR j := 0 to 15
	i := j*32
	IF k[j]
		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
	FI
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_epi32'. Requires KNCNI.

func M512MaskReduceMinEpi64 ¶

func M512MaskReduceMinEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceMinEpi64: Reduce the packed 64-bit integers in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[63:0] := MAX_INT
FOR j := 0 to 7
	i := j*64
	IF k[j]
		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
	FI
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_epi64'. Requires KNCNI.

func M512MaskReduceMinEpu32 ¶

func M512MaskReduceMinEpu32(k x86.Mmask16, a x86.M512i) uint32

M512MaskReduceMinEpu32: Reduce the packed unsigned 32-bit integers in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[31:0] := MAX_UINT
FOR j := 0 to 15
	i := j*32
	IF k[j]
		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
	FI
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_epu32'. Requires KNCNI.

func M512MaskReduceMinEpu64 ¶

func M512MaskReduceMinEpu64(k x86.Mmask8, a x86.M512i) uint64

M512MaskReduceMinEpu64: Reduce the packed unsigned 64-bit integers in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[63:0] := MAX_UINT
FOR j := 0 to 7
	i := j*64
	IF k[j]
		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
	FI
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_epu64'. Requires KNCNI.

func M512MaskReduceMinPd ¶

func M512MaskReduceMinPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceMinPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[63:0] := MAX_DOUBLE
FOR j := 0 to 7
	i := j*64
	IF k[j]
		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
	FI
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_pd'. Requires KNCNI.

func M512MaskReduceMinPs ¶

func M512MaskReduceMinPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceMinPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by minimum using mask 'k'. Returns the minimum of all active elements in 'a'.

min[31:0] := MAX_FLOAT
FOR j := 0 to 15
	i := j*32
	IF k[j]
		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
	FI
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_min_ps'. Requires KNCNI.

func M512MaskReduceMulEpi32 ¶

func M512MaskReduceMulEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceMulEpi32: Reduce the packed 32-bit integers in 'a' by multiplication using mask 'k'. Returns the product of all active elements in 'a'.

prod[31:0] := 1
FOR j := 0 to 15
	i := j*32
	IF k[j]
		prod[31:0] := prod[31:0] * a[i+31:i]
	FI
ENDFOR
RETURN prod[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_mul_epi32'. Requires KNCNI.

func M512MaskReduceMulEpi64 ¶

func M512MaskReduceMulEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceMulEpi64: Reduce the packed 64-bit integers in 'a' by multiplication using mask 'k'. Returns the product of all active elements in 'a'.

prod[63:0] := 1
FOR j := 0 to 7
	i := j*64
	IF k[j]
		prod[63:0] := prod[63:0] * a[i+63:i]
	FI
ENDFOR
RETURN prod[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_mul_epi64'. Requires KNCNI.

func M512MaskReduceMulPd ¶

func M512MaskReduceMulPd(k x86.Mmask8, a x86.M512d) float64

M512MaskReduceMulPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by multiplication using mask 'k'. Returns the product of all active elements in 'a'.

prod[63:0] := 1
FOR j := 0 to 7
	i := j*64
	IF k[j]
		prod[63:0] := prod[63:0] * a[i+63:i]
	FI
ENDFOR
RETURN prod[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_mul_pd'. Requires KNCNI.

func M512MaskReduceMulPs ¶

func M512MaskReduceMulPs(k x86.Mmask16, a x86.M512) float32

M512MaskReduceMulPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by multiplication using mask 'k'. Returns the product of all active elements in 'a'.

prod[31:0] := 1
FOR j := 0 to 15
	i := j*32
	IF k[j]
		prod[31:0] := prod[31:0] * a[i+31:i]
	FI
ENDFOR
RETURN prod[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_mul_ps'. Requires KNCNI.

func M512MaskReduceOrEpi32 ¶

func M512MaskReduceOrEpi32(k x86.Mmask16, a x86.M512i) int

M512MaskReduceOrEpi32: Reduce the packed 32-bit integers in 'a' by bitwise OR using mask 'k'. Returns the bitwise OR of all active elements in 'a'.

reduced[31:0] := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		reduced[31:0] := reduced[31:0] OR a[i+31:i]
	FI
ENDFOR
RETURN reduced[31:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_or_epi32'. Requires KNCNI.

func M512MaskReduceOrEpi64 ¶

func M512MaskReduceOrEpi64(k x86.Mmask8, a x86.M512i) int64

M512MaskReduceOrEpi64: Reduce the packed 64-bit integers in 'a' by bitwise OR using mask 'k'. Returns the bitwise OR of all active elements in 'a'.

reduced[63:0] := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		reduced[63:0] := reduced[63:0] OR a[i+63:i]
	FI
ENDFOR
RETURN reduced[63:0]

Instruction: '...'. Intrinsic: '_mm512_mask_reduce_or_epi64'. Requires KNCNI.

func M512MaskRoundPs ¶

func M512MaskRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512MaskRoundPs: Round the packed single-precision (32-bit) floating-point elements in 'a' to the nearest integer value using 'expadj' and in the direction of 'rounding', and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ROUND(a[i+31:i])
				CASE expadj OF
				_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
				_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
				_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
				_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
				_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
				_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
				_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
				_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
				ESAC
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VROUNDPS'. Intrinsic: '_mm512_mask_round_ps'. Requires KNCNI.

func M512MaskRoundfxpntAdjustPd ¶

func M512MaskRoundfxpntAdjustPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int, expadj MMEXPADJENUM) (dst x86.M512d)

M512MaskRoundfxpntAdjustPd: Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in 'a' using 'expadj' and in the direction of 'rounding' and stores results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ROUND(a[i+63:i])
				CASE expadj OF
				_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
				_MM_EXPADJ_4:	dst[i+63:i] = dst[i+63:i] * 2**4
				_MM_EXPADJ_5:	dst[i+63:i] = dst[i+63:i] * 2**5
				_MM_EXPADJ_8:	dst[i+63:i] = dst[i+63:i] * 2**8
				_MM_EXPADJ_16:   dst[i+63:i] = dst[i+63:i] * 2**16
				_MM_EXPADJ_24:   dst[i+63:i] = dst[i+63:i] * 2**24
				_MM_EXPADJ_31:   dst[i+63:i] = dst[i+63:i] * 2**31
				_MM_EXPADJ_32:   dst[i+63:i] = dst[i+63:i] * 2**32
				ESAC
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDFXPNTPD'. Intrinsic: '_mm512_mask_roundfxpnt_adjust_pd'. Requires KNCNI.

func M512MaskRoundfxpntAdjustPs ¶

func M512MaskRoundfxpntAdjustPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512MaskRoundfxpntAdjustPs: Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in 'a' using 'expadj' and in the direction of 'rounding' and stores results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ROUND(a[i+31:i])
				CASE expadj OF
				_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
				_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
				_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
				_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
				_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
				_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
				_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
				_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
				ESAC
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDFXPNTPS'. Intrinsic: '_mm512_mask_roundfxpnt_adjust_ps'. Requires KNCNI.

func M512MaskRsqrt23Ps ¶

func M512MaskRsqrt23Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRsqrt23Ps: Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a' to 23 bits of accuracy and stores the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT23PS'. Intrinsic: '_mm512_mask_rsqrt23_ps'. Requires KNCNI.

func M512MaskSbbEpi32 ¶

func M512MaskSbbEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512MaskSbbEpi32: Performs element-by-element three-input subtraction of packed 32-bit integer elements of 'v3' as well as the corresponding bit from 'k2' from 'v2'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag). Results are stored in 'dst' using writemask 'k1' (elements are copied from 'v2' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j]
		borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j])
	ELSE
		dst[i+31:i] := v2[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSBBD'. Intrinsic: '_mm512_mask_sbb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskSbbrEpi32 ¶

func M512MaskSbbrEpi32(v2 x86.M512i, k1 x86.Mmask16, k2 x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512MaskSbbrEpi32: Performs element-by-element three-input subtraction of packed 32-bit integer elements of 'v2' as well as the corresponding bit from 'k2' from 'v3'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag). Results are stored in 'dst' using writemask 'k1' (elements are copied from 'v2' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j]
		borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j])
	ELSE
		dst[i+31:i] := v2[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSBBRD'. Intrinsic: '_mm512_mask_sbbr_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskScalePs ¶

func M512MaskScalePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)

M512MaskScalePs: Scales each single-precision (32-bit) floating-point element in 'a' by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in 'b', storing results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEPS'. Intrinsic: '_mm512_mask_scale_ps'. Requires KNCNI.

func M512MaskScaleRoundPs ¶

func M512MaskScaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i, rounding int) (dst x86.M512)

M512MaskScaleRoundPs: Scales each single-precision (32-bit) floating-point element in 'a' by multiplying it by 2**exp, where the exp is the corresponding 32-bit integer element in 'b', storing results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Results are rounded using constant 'rounding'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEPS'. Intrinsic: '_mm512_mask_scale_round_ps'. Requires KNCNI.

func M512MaskShuffleEpi32 ¶

func M512MaskShuffleEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm512_mask_shuffle_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskSlliEpi32 ¶

func M512MaskSlliEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_mask_slli_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskSllvEpi32 ¶

func M512MaskSllvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm512_mask_sllv_epi32'. Requires KNCNI.

func M512MaskSraiEpi32 ¶

func M512MaskSraiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_mask_srai_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskSravEpi32 ¶

func M512MaskSravEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm512_mask_srav_epi32'. Requires KNCNI.

func M512MaskSrliEpi32 ¶

func M512MaskSrliEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_mask_srli_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512MaskSrlvEpi32 ¶

func M512MaskSrlvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm512_mask_srlv_epi32'. Requires KNCNI.

func M512MaskSubEpi32 ¶

func M512MaskSubEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm512_mask_sub_epi32'. Requires KNCNI.

func M512MaskSubPd ¶

func M512MaskSubPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_mask_sub_pd'. Requires KNCNI.

func M512MaskSubPs ¶

func M512MaskSubPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_mask_sub_ps'. Requires KNCNI.

func M512MaskSubRoundPd ¶

func M512MaskSubRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskSubRoundPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] - b[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_mask_sub_round_pd'. Requires KNCNI.

func M512MaskSubRoundPs ¶

func M512MaskSubRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskSubRoundPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] - b[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_mask_sub_round_ps'. Requires KNCNI.

func M512MaskSubrEpi32 ¶

func M512MaskSubrEpi32(src x86.M512i, k x86.Mmask16, v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)

M512MaskSubrEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v2' from 'v3' storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBRD'. Intrinsic: '_mm512_mask_subr_epi32'. Requires KNCNI.

func M512MaskSubrPd ¶

func M512MaskSubrPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)

M512MaskSubrPd: Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBRPD'. Intrinsic: '_mm512_mask_subr_pd'. Requires KNCNI.

func M512MaskSubrPs ¶

func M512MaskSubrPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512) (dst x86.M512)

M512MaskSubrPs: Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBRPS'. Intrinsic: '_mm512_mask_subr_ps'. Requires KNCNI.

func M512MaskSubrRoundPd ¶

func M512MaskSubrRoundPd(src x86.M512d, k x86.Mmask8, v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)

M512MaskSubrRoundPd: Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBRPD'. Intrinsic: '_mm512_mask_subr_round_pd'. Requires KNCNI.

func M512MaskSubrRoundPs ¶

func M512MaskSubrRoundPs(src x86.M512, k x86.Mmask16, v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)

M512MaskSubrRoundPs: Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBRPS'. Intrinsic: '_mm512_mask_subr_round_ps'. Requires KNCNI.

func M512MaskSubrsetbEpi32 ¶

func M512MaskSubrsetbEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512MaskSubrsetbEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v2' from 'v3', storing the results in 'dst' and 'v2'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag). Results are written using writemask 'k' (borrow bits are copied from 'k_old' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		diff := v3[i+31:i] - v2[i+31:i]
		borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
		dst[i+31:i] := diff
		v2[i+31:i] := diff
	ELSE
		borrow[j] := k_old[j]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBRSETBD'. Intrinsic: '_mm512_mask_subrsetb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskSubsetbEpi32 ¶

func M512MaskSubsetbEpi32(v2 x86.M512i, k x86.Mmask16, k_old x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512MaskSubsetbEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v3' from 'v2', storing the results in 'dst' and the nth borrow bit in the nth position of 'borrow' (borrow flag). Results are stored using writemask 'k' (elements are copied from 'v2' and 'k_old' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
		borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
	ELSE
		dst[i+31:i] := v2[i+31:i]
		borrow[j] := k_old[j]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSETBD'. Intrinsic: '_mm512_mask_subsetb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskSwizzleEpi32 ¶

func M512MaskSwizzleEpi32(src x86.M512i, k x86.Mmask16, v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)

M512MaskSwizzleEpi32: Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in 'v' using swizzle parameter 's', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 7
		i := j*64
		IF k[j*2]
			dst[i+31:i]	:= v[i+63:i+32]
		ELSE
			dst[i+31:i]	:= src[i+31:i]
		FI
		IF k[j*2+1]
			dst[i+63:i+32] := v[i+31:i]
		ELSE
			dst[i+63:i+32] := src[i+63:i+32]
		FI
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+95:i+64]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+127:i+96]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+63:i+32]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+31:i]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+31:i]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+31:i]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+63:i+32]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+63:i+32]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+63:i+32]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+63:i+32]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+95:i+64]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+95:i+64]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+95:i+64]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+95:i+64]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+127:i+96]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+127:i+96]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+127:i+96]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+127:i+96]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+63:i+32]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+95:i+64]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+127:i+96]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_swizzle_epi32'. Requires KNCNI.

func M512MaskSwizzleEpi64 ¶

func M512MaskSwizzleEpi64(src x86.M512i, k x86.Mmask8, v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)

M512MaskSwizzleEpi64: Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in 'v' using swizzle parameter 's', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*2]
			dst[i+63:i]	 := v[i+127:i+64]
		ELSE
			dst[i+63:i]	 := src[i+63:i]
		FI
		IF k[j*2+1]
			dst[i+127:i+64] := v[i+63:i]
		ELSE
			dst[i+127:i+64] := src[i+127:i+64]
		FI
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+191:i+128]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+255:i+192]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+127:i+64]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+63:i]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+63:i]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+63:i]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+127:i+64]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+127:i+64]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+127:i+64]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+127:i+64]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+191:i+128]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+191:i+128]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+191:i+128]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+191:i+128]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+255:i+192]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+255:i+192]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+255:i+192]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+255:i+192]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+127:i+64]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+191:i+128]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+255:i+192]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_swizzle_epi64'. Requires KNCNI.

func M512MaskSwizzlePd ¶

func M512MaskSwizzlePd(src x86.M512d, k x86.Mmask8, v x86.M512d, s MMSWIZZLEENUM) (dst x86.M512d)

M512MaskSwizzlePd: Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in 'v' using swizzle parameter 's', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*2]
			dst[i+63:i]	 := v[i+127:i+64]
		ELSE
			dst[i+63:i]	 := src[i+63:i]
		FI
		IF k[j*2+1]
			dst[i+127:i+64] := v[i+63:i]
		ELSE
			dst[i+127:i+64] := src[i+127:i+64]
		FI
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+191:i+128]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+255:i+192]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+127:i+64]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+63:i]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+63:i]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+63:i]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+127:i+64]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+127:i+64]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+127:i+64]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+127:i+64]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+191:i+128]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+191:i+128]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+191:i+128]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+191:i+128]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+255:i+192]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+255:i+192]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+255:i+192]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+255:i+192]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 1
		i := j*256
		IF k[j*4]
			dst[i+63:i]	  := v[i+127:i+64]
		ELSE
			dst[i+63:i]	  := src[i+63:i]
		FI
		IF k[j*4+1]
			dst[i+127:i+64]  := v[i+191:i+128]
		ELSE
			dst[i+127:i+64]  := src[i+127:i+64]
		FI
		IF k[j*4+2]
			dst[i+191:i+128] := v[i+63:i]
		ELSE
			dst[i+191:i+128] := src[i+191:i+128]
		FI
		IF k[j*4+3]
			dst[i+255:i+192] := v[i+255:i+192]
		ELSE
			dst[i+255:i+192] := src[i+255:i+192]
		FI
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_swizzle_pd'. Requires KNCNI.

func M512MaskSwizzlePs ¶

func M512MaskSwizzlePs(src x86.M512, k x86.Mmask16, v x86.M512, s MMSWIZZLEENUM) (dst x86.M512)

M512MaskSwizzlePs: Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in 'v' using swizzle parameter 's', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 7
		i := j*64
		IF k[j*2]
			dst[i+31:i]	:= v[i+63:i+32]
		ELSE
			dst[i+31:i]	:= src[i+31:i]
		FI
		IF k[j*2+1]
			dst[i+63:i+32] := v[i+31:i]
		ELSE
			dst[i+63:i+32] := src[i+63:i+32]
		FI
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+95:i+64]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+127:i+96]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+63:i+32]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+31:i]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+31:i]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+31:i]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+63:i+32]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+63:i+32]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+63:i+32]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+63:i+32]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+95:i+64]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+95:i+64]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+95:i+64]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+95:i+64]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+127:i+96]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+127:i+96]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+127:i+96]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+127:i+96]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 3
		i := j*128
		IF k[j*4]
			dst[i+31:i]	 := v[i+63:i+32]
		ELSE
			dst[i+31:i]	 := src[i+31:i]
		FI
		IF k[j*4+1]
			dst[i+63:i+32]  := v[i+95:i+64]
		ELSE
			dst[i+63:i+32]  := src[i+63:i+32]
		FI
		IF k[j*4+2]
			dst[i+95:i+64]  := v[i+31:i]
		ELSE
			dst[i+95:i+64]  := src[i+95:i+64]
		FI
		IF k[j*4+3]
			dst[i+127:i+96] := v[i+127:i+96]
		ELSE
			dst[i+127:i+96] := src[i+127:i+96]
		FI
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_swizzle_ps'. Requires KNCNI.

func M512MaskTestEpi32Mask ¶

func M512MaskTestEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm512_mask_test_epi32_mask'. Requires KNCNI.

func M512MaskXorEpi32 ¶

func M512MaskXorEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm512_mask_xor_epi32'. Requires KNCNI.

func M512MaskXorEpi64 ¶

func M512MaskXorEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_mask_xor_epi64'. Requires KNCNI.

func M512MaxEpi32 ¶

func M512MaxEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31:i] > b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm512_max_epi32'. Requires KNCNI.

func M512MaxEpu32 ¶

func M512MaxEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31:i] > b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm512_max_epu32'. Requires KNCNI.

func M512MaxabsPs ¶

func M512MaxabsPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MaxabsPs: Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in 'a' and 'b', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGMAXABSPS'. Intrinsic: '_mm512_maxabs_ps'. Requires KNCNI.

func M512MinEpi32 ¶

func M512MinEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31:i] < b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm512_min_epi32'. Requires KNCNI.

func M512MinEpu32 ¶

func M512MinEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31:i] < b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm512_min_epu32'. Requires KNCNI.

func M512MulPd ¶

func M512MulPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_mul_pd'. Requires KNCNI.

func M512MulPs ¶

func M512MulPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_mul_ps'. Requires KNCNI.

func M512MulRoundPd ¶

func M512MulRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MulRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := a[i+63:i] * b[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_mul_round_pd'. Requires KNCNI.

func M512MulRoundPs ¶

func M512MulRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MulRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := a[i+31:i] * b[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_mul_round_ps'. Requires KNCNI.

func M512MulhiEpi32 ¶

func M512MulhiEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulhiEpi32: Performs element-by-element multiplication between packed 32-bit integer elements in 'a' and 'b' and stores the high 32 bits of each result into 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHD'. Intrinsic: '_mm512_mulhi_epi32'. Requires KNCNI.

func M512MulhiEpu32 ¶

func M512MulhiEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulhiEpu32: Performs element-by-element multiplication between packed unsigned 32-bit integer elements in 'a' and 'b' and stores the high 32 bits of each result into 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHUD'. Intrinsic: '_mm512_mulhi_epu32'. Requires KNCNI.

func M512MulloEpi32 ¶

func M512MulloEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst'.

FOR j := 0 to 15
	i := j*32
	tmp[63:0] := a[i+31:i] * b[i+31:i]
	dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm512_mullo_epi32'. Requires KNCNI.

func M512OrEpi32 ¶

func M512OrEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512OrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORD'. Intrinsic: '_mm512_or_epi32'. Requires KNCNI.

func M512OrEpi64 ¶

func M512OrEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512OrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the result in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm512_or_epi64'. Requires KNCNI.

func M512OrSi512 ¶

func M512OrSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512OrSi512: Compute the bitwise OR of 512 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[511:0] := (a[511:0] OR b[511:0])
dst[MAX:512] := 0

Instruction: 'VPORD'. Intrinsic: '_mm512_or_si512'. Requires KNCNI.

func M512Permute4f128Epi32 ¶

func M512Permute4f128Epi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512Permute4f128Epi32: Permutes 128-bit blocks of the packed 32-bit integer vector 'a' using constant 'imm8'. The results are stored in 'dst'.

SELECT4(src, control) {
	CASE control[1:0] OF
	0: tmp[127:0] := src[127:0]
	1: tmp[127:0] := src[255:128]
	2: tmp[127:0] := src[383:256]
	3: tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

FOR j := 0 to 3
	i := j*128
	n := j*2
	dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMF32X4'. Intrinsic: '_mm512_permute4f128_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512Permute4f128Ps ¶

func M512Permute4f128Ps(a x86.M512, imm8 byte) (dst x86.M512)

M512Permute4f128Ps: Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in 'a' using constant 'imm8'. The results are stored in 'dst'.

SELECT4(src, control) {
	CASE control[1:0] OF
	0: tmp[127:0] := src[127:0]
	1: tmp[127:0] := src[255:128]
	2: tmp[127:0] := src[383:256]
	3: tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

FOR j := 0 to 3
	i := j*128
	n := j*2
	dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMF32X4'. Intrinsic: '_mm512_permute4f128_ps'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512PermutevarEpi32 ¶

func M512PermutevarEpi32(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutevarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the 'permutevar' name. This intrinsic is identical to '_mm512_permutexvar_epi32', and it is recommended that you use that intrinsic name.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_permutevar_epi32'. Requires KNCNI.

func M512Rcp23Ps ¶

func M512Rcp23Ps(a x86.M512) (dst x86.M512)

M512Rcp23Ps: Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in 'a' to 23 bits of precision, storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP23PS'. Intrinsic: '_mm512_rcp23_ps'. Requires KNCNI.

func M512ReduceAddEpi32 ¶

func M512ReduceAddEpi32(a x86.M512i) int

M512ReduceAddEpi32: Reduce the packed 32-bit integers in 'a' by addition. Returns the sum of all elements in 'a'.

sum[31:0] := 0
FOR j := 0 to 15
	i := j*32
	sum[31:0] := sum[31:0] + a[i+31:i]
ENDFOR
RETURN sum[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_add_epi32'. Requires KNCNI.

func M512ReduceAddEpi64 ¶

func M512ReduceAddEpi64(a x86.M512i) int64

M512ReduceAddEpi64: Reduce the packed 64-bit integers in 'a' by addition. Returns the sum of all elements in 'a'.

sum[63:0] := 0
FOR j := 0 to 7
	i := j*64
	sum[63:0] := sum[63:0] + a[i+63:i]
ENDFOR
RETURN sum[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_add_epi64'. Requires KNCNI.

func M512ReduceAddPd ¶

func M512ReduceAddPd(a x86.M512d) float64

M512ReduceAddPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by addition. Returns the sum of all elements in 'a'.

sum[63:0] := 0
FOR j := 0 to 7
	i := j*64
	sum[63:0] := sum[63:0] + a[i+63:i]
ENDFOR
RETURN sum[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_add_pd'. Requires KNCNI.

func M512ReduceAddPs ¶

func M512ReduceAddPs(a x86.M512) float32

M512ReduceAddPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by addition. Returns the sum of all elements in 'a'.

sum[31:0] := 0
FOR j := 0 to 15
	i := j*32
	sum[31:0] := sum[31:0] + a[i+31:i]
ENDFOR
RETURN sum[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_add_ps'. Requires KNCNI.

func M512ReduceAndEpi32 ¶

func M512ReduceAndEpi32(a x86.M512i) int

M512ReduceAndEpi32: Reduce the packed 32-bit integers in 'a' by bitwise AND. Returns the bitwise AND of all elements in 'a'.

reduced[31:0] := 0xFFFFFFFF
FOR j := 0 to 15
	i := j*32
	reduced[31:0] := reduced[31:0] AND a[i+31:i]
ENDFOR
RETURN reduced[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_and_epi32'. Requires KNCNI.

func M512ReduceAndEpi64 ¶

func M512ReduceAndEpi64(a x86.M512i) int64

M512ReduceAndEpi64: Reduce the packed 64-bit integers in 'a' by bitwise AND. Returns the bitwise AND of all elements in 'a'.

reduced[63:0] := 0xFFFFFFFFFFFFFFFF
FOR j := 0 to 7
	i := j*64
	reduced[63:0] := reduced[63:0] AND a[i+63:i]
ENDFOR
RETURN reduced[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_and_epi64'. Requires KNCNI.

func M512ReduceGmaxPd ¶

func M512ReduceGmaxPd(a x86.M512d) float64

M512ReduceGmaxPd: Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in 'a' and stores the result in 'dst'.

max = a[63:0]
FOR j := 1 to 7
	i := j*64
	max = FpMax(max, a[i+63:i])
ENDFOR
dst := max

Instruction: '...'. Intrinsic: '_mm512_reduce_gmax_pd'. Requires KNCNI.

func M512ReduceGmaxPs ¶

func M512ReduceGmaxPs(a x86.M512) float32

M512ReduceGmaxPs: Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in 'a' and stores the result in 'dst'.

max = a[31:0]
FOR j := 1 to 15
	i := j*32
	max = FpMax(max, a[i+31:i])
ENDFOR
dst := max

Instruction: '...'. Intrinsic: '_mm512_reduce_gmax_ps'. Requires KNCNI.

func M512ReduceGminPd ¶

func M512ReduceGminPd(a x86.M512d) float64

M512ReduceGminPd: Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in 'a' and stores the result in 'dst'.

min = a[63:0]
FOR j := 1 to 7
	i := j*64
	min = FpMin(min, a[i+63:i])
ENDFOR
dst := min

Instruction: '...'. Intrinsic: '_mm512_reduce_gmin_pd'. Requires KNCNI.

func M512ReduceGminPs ¶

func M512ReduceGminPs(a x86.M512) float32

M512ReduceGminPs: Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in 'a' and stores the result in 'dst'.

min = a[31:0]
FOR j := 1 to 15
	i := j*32
	min = FpMin(min, a[i+31:i])
ENDFOR
dst := min

Instruction: '...'. Intrinsic: '_mm512_reduce_gmin_ps'. Requires KNCNI.

func M512ReduceMaxEpi32 ¶

func M512ReduceMaxEpi32(a x86.M512i) int

M512ReduceMaxEpi32: Reduce the packed 32-bit integers in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[31:0] := MIN_INT
FOR j := 0 to 15
	i := j*32
	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_epi32'. Requires KNCNI.

func M512ReduceMaxEpi64 ¶

func M512ReduceMaxEpi64(a x86.M512i) int64

M512ReduceMaxEpi64: Reduce the packed 64-bit integers in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[63:0] := MIN_INT
FOR j := 0 to 7
	i := j*64
	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_epi64'. Requires KNCNI.

func M512ReduceMaxEpu32 ¶

func M512ReduceMaxEpu32(a x86.M512i) uint32

M512ReduceMaxEpu32: Reduce the packed unsigned 32-bit integers in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[31:0] := 0
FOR j := 0 to 15
	i := j*32
	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_epu32'. Requires KNCNI.

func M512ReduceMaxEpu64 ¶

func M512ReduceMaxEpu64(a x86.M512i) uint64

M512ReduceMaxEpu64: Reduce the packed unsigned 64-bit integers in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[63:0] := 0
FOR j := 0 to 7
	i := j*64
	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_epu64'. Requires KNCNI.

func M512ReduceMaxPd ¶

func M512ReduceMaxPd(a x86.M512d) float64

M512ReduceMaxPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[63:0] := MIN_DOUBLE
FOR j := 0 to 7
	i := j*64
	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_pd'. Requires KNCNI.

func M512ReduceMaxPs ¶

func M512ReduceMaxPs(a x86.M512) float32

M512ReduceMaxPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by maximum. Returns the maximum of all elements in 'a'.

max[31:0] := MIN_FLOAT
FOR j := 0 to 15
	i := j*32
	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_max_ps'. Requires KNCNI.

func M512ReduceMinEpi32 ¶

func M512ReduceMinEpi32(a x86.M512i) int

M512ReduceMinEpi32: Reduce the packed 32-bit integers in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[31:0] := MAX_INT
FOR j := 0 to 15
	i := j*32
	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_epi32'. Requires KNCNI.

func M512ReduceMinEpi64 ¶

func M512ReduceMinEpi64(a x86.M512i) int64

M512ReduceMinEpi64: Reduce the packed 64-bit integers in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[63:0] := MAX_INT
FOR j := 0 to 7
	i := j*64
	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_epi64'. Requires KNCNI.

func M512ReduceMinEpu32 ¶

func M512ReduceMinEpu32(a x86.M512i) uint32

M512ReduceMinEpu32: Reduce the packed unsigned 32-bit integers in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[31:0] := MAX_UINT
FOR j := 0 to 15
	i := j*32
	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_epu32'. Requires KNCNI.

func M512ReduceMinEpu64 ¶

func M512ReduceMinEpu64(a x86.M512i) uint64

M512ReduceMinEpu64: Reduce the packed unsigned 64-bit integers in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[63:0] := MAX_UINT
FOR j := 0 to 7
	i := j*64
	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_epu64'. Requires KNCNI.

func M512ReduceMinPd ¶

func M512ReduceMinPd(a x86.M512d) float64

M512ReduceMinPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[63:0] := MAX_DOUBLE
FOR j := 0 to 7
	i := j*64
	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_pd'. Requires KNCNI.

func M512ReduceMinPs ¶

func M512ReduceMinPs(a x86.M512) float32

M512ReduceMinPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by minimum. Returns the minimum of all elements in 'a'.

min[31:0] := MAX_FLOAT
FOR j := 0 to 15
	i := j*32
	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_min_ps'. Requires KNCNI.

func M512ReduceMulEpi32 ¶

func M512ReduceMulEpi32(a x86.M512i) int

M512ReduceMulEpi32: Reduce the packed 32-bit integers in 'a' by multiplication. Returns the product of all elements in 'a'.

prod[31:0] := 1
FOR j := 0 to 15
	i := j*32
	prod[31:0] := prod[31:0] * a[i+31:i]
ENDFOR
RETURN prod[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_mul_epi32'. Requires KNCNI.

func M512ReduceMulEpi64 ¶

func M512ReduceMulEpi64(a x86.M512i) int64

M512ReduceMulEpi64: Reduce the packed 64-bit integers in 'a' by multiplication. Returns the product of all elements in 'a'.

prod[63:0] := 1
FOR j := 0 to 7
	i := j*64
	prod[63:0] := prod[63:0] * a[i+63:i]
ENDFOR
RETURN prod[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_mul_epi64'. Requires KNCNI.

func M512ReduceMulPd ¶

func M512ReduceMulPd(a x86.M512d) float64

M512ReduceMulPd: Reduce the packed double-precision (64-bit) floating-point elements in 'a' by multiplication. Returns the product of all elements in 'a'.

prod[63:0] := 1
FOR j := 0 to 7
	i := j*64
	prod[63:0] := prod[63:0] * a[i+63:i]
ENDFOR
RETURN prod[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_mul_pd'. Requires KNCNI.

func M512ReduceMulPs ¶

func M512ReduceMulPs(a x86.M512) float32

M512ReduceMulPs: Reduce the packed single-precision (32-bit) floating-point elements in 'a' by multiplication. Returns the product of all elements in 'a'.

prod[31:0] := 1
FOR j := 0 to 15
	i := j*32
	prod[31:0] := prod[31:0] * a[i+31:i]
ENDFOR
RETURN prod[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_mul_ps'. Requires KNCNI.

func M512ReduceOrEpi32 ¶

func M512ReduceOrEpi32(a x86.M512i) int

M512ReduceOrEpi32: Reduce the packed 32-bit integers in 'a' by bitwise OR. Returns the bitwise OR of all elements in 'a'.

reduced[31:0] := 0
FOR j := 0 to 15
	i := j*32
	reduced[31:0] := reduced[31:0] OR a[i+31:i]
ENDFOR
RETURN reduced[31:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_or_epi32'. Requires KNCNI.

func M512ReduceOrEpi64 ¶

func M512ReduceOrEpi64(a x86.M512i) int64

M512ReduceOrEpi64: Reduce the packed 64-bit integers in 'a' by bitwise OR. Returns the bitwise OR of all elements in 'a'.

reduced[63:0] := 0
FOR j := 0 to 7
	i := j*64
	reduced[63:0] := reduced[63:0] OR a[i+63:i]
ENDFOR
RETURN reduced[63:0]

Instruction: '...'. Intrinsic: '_mm512_reduce_or_epi64'. Requires KNCNI.

func M512RoundPs ¶

func M512RoundPs(a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512RoundPs: Round the packed single-precision (32-bit) floating-point elements in 'a' to the nearest integer value using 'expadj' and in the direction of 'rounding', and store the results as packed single-precision floating-point elements in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := ROUND(a[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VROUNDPS'. Intrinsic: '_mm512_round_ps'. Requires KNCNI.

func M512RoundfxpntAdjustPd ¶

func M512RoundfxpntAdjustPd(a x86.M512d, rounding int, expadj MMEXPADJENUM) (dst x86.M512d)

M512RoundfxpntAdjustPd: Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in 'a' using 'expadj' and in the direction of 'rounding' and stores results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ROUND(a[i+63:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
			_MM_EXPADJ_4:	dst[i+63:i] = dst[i+63:i] * 2**4
			_MM_EXPADJ_5:	dst[i+63:i] = dst[i+63:i] * 2**5
			_MM_EXPADJ_8:	dst[i+63:i] = dst[i+63:i] * 2**8
			_MM_EXPADJ_16:   dst[i+63:i] = dst[i+63:i] * 2**16
			_MM_EXPADJ_24:   dst[i+63:i] = dst[i+63:i] * 2**24
			_MM_EXPADJ_31:   dst[i+63:i] = dst[i+63:i] * 2**31
			_MM_EXPADJ_32:   dst[i+63:i] = dst[i+63:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDFXPNTPD'. Intrinsic: '_mm512_roundfxpnt_adjust_pd'. Requires KNCNI.

func M512RoundfxpntAdjustPs ¶

func M512RoundfxpntAdjustPs(a x86.M512, rounding int, expadj MMEXPADJENUM) (dst x86.M512)

M512RoundfxpntAdjustPs: Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in 'a' using 'expadj' and in the direction of 'rounding' and stores results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := ROUND(a[i+31:i])
			CASE expadj OF
			_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
			_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
			_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
			_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
			_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
			_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
			_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
			_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
			ESAC
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDFXPNTPS'. Intrinsic: '_mm512_roundfxpnt_adjust_ps'. Requires KNCNI.

func M512Rsqrt23Ps ¶

func M512Rsqrt23Ps(a x86.M512) (dst x86.M512)

M512Rsqrt23Ps: Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a' to 23 bits of accuracy and stores the result in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT23PS'. Intrinsic: '_mm512_rsqrt23_ps'. Requires KNCNI.

func M512SbbEpi32 ¶

func M512SbbEpi32(v2 x86.M512i, k x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512SbbEpi32: Performs element-by-element three-input subtraction of packed 32-bit integer elements of 'v3' as well as the corresponding bit from 'k' from 'v2'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag). Results are stored in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j]
	borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSBBD'. Intrinsic: '_mm512_sbb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SbbrEpi32 ¶

func M512SbbrEpi32(v2 x86.M512i, k x86.Mmask16, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512SbbrEpi32: Performs element-by-element three-input subtraction of packed 32-bit integer elements of 'v2' as well as the corresponding bit from 'k' from 'v3'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag). Results are stored in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j]
	borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSBBRD'. Intrinsic: '_mm512_sbbr_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512ScalePs ¶

func M512ScalePs(a x86.M512, b x86.M512i) (dst x86.M512)

M512ScalePs: Scales each single-precision (32-bit) floating-point element in 'a' by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in 'b', storing results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEPS'. Intrinsic: '_mm512_scale_ps'. Requires KNCNI.

func M512ScaleRoundPs ¶

func M512ScaleRoundPs(a x86.M512, b x86.M512i, rounding int) (dst x86.M512)

M512ScaleRoundPs: Scales each single-precision (32-bit) floating-point element in 'a' by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in 'b', storing results in 'dst'. Intermediate elements are rounded using 'rounding'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_scale_round_ps'. Requires KNCNI.

func M512ShuffleEpi32 ¶

func M512ShuffleEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(a[383:256], imm8[5:4])
dst[383:352] := SELECT4(a[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(a[511:384], imm8[5:4])
dst[511:480] := SELECT4(a[511:384], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm512_shuffle_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512SlliEpi32 ¶

func M512SlliEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_slli_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512SllvEpi32 ¶

func M512SllvEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm512_sllv_epi32'. Requires KNCNI.

func M512SraiEpi32 ¶

func M512SraiEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_srai_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512SravEpi32 ¶

func M512SravEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm512_srav_epi32'. Requires KNCNI.

func M512SrliEpi32 ¶

func M512SrliEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_srli_epi32'. Requires KNCNI.

FIXME: Requires compiler support (has immediate)

func M512SrlvEpi32 ¶

func M512SrlvEpi32(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm512_srlv_epi32'. Requires KNCNI.

func M512SubEpi32 ¶

func M512SubEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm512_sub_epi32'. Requires KNCNI.

func M512SubPd ¶

func M512SubPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512SubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_sub_pd'. Requires KNCNI.

func M512SubPs ¶

func M512SubPs(a x86.M512, b x86.M512) (dst x86.M512)

M512SubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_sub_ps'. Requires KNCNI.

func M512SubRoundPd ¶

func M512SubRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512SubRoundPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := a[i+63:i] - b[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_sub_round_pd'. Requires KNCNI.

func M512SubRoundPs ¶

func M512SubRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512SubRoundPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := a[i+31:i] - b[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_sub_round_ps'. Requires KNCNI.

func M512SubrEpi32 ¶

func M512SubrEpi32(v2 x86.M512i, v3 x86.M512i) (dst x86.M512i)

M512SubrEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v2' from 'v3' storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBRD'. Intrinsic: '_mm512_subr_epi32'. Requires KNCNI.

func M512SubrPd ¶

func M512SubrPd(v2 x86.M512d, v3 x86.M512d) (dst x86.M512d)

M512SubrPd: Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBRPD'. Intrinsic: '_mm512_subr_pd'. Requires KNCNI.

func M512SubrPs ¶

func M512SubrPs(v2 x86.M512, v3 x86.M512) (dst x86.M512)

M512SubrPs: Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBRPS'. Intrinsic: '_mm512_subr_ps'. Requires KNCNI.

func M512SubrRoundPd ¶

func M512SubrRoundPd(v2 x86.M512d, v3 x86.M512d, rounding int) (dst x86.M512d)

M512SubrRoundPd: Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBRPD'. Intrinsic: '_mm512_subr_round_pd'. Requires KNCNI.

func M512SubrRoundPs ¶

func M512SubrRoundPs(v2 x86.M512, v3 x86.M512, rounding int) (dst x86.M512)

M512SubrRoundPs: Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in 'v2' from 'v3' storing the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBRPS'. Intrinsic: '_mm512_subr_round_ps'. Requires KNCNI.

func M512SubrsetbEpi32 ¶

func M512SubrsetbEpi32(v2 x86.M512i, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512SubrsetbEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v2' from 'v3', storing the results in 'dst' and 'v2'. The borrowed value from the subtraction difference for the nth element is written to the nth bit of 'borrow' (borrow flag).

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
	borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBRSETBD'. Intrinsic: '_mm512_subrsetb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SubsetbEpi32 ¶

func M512SubsetbEpi32(v2 x86.M512i, v3 x86.M512i, borrow *x86.Mmask16) (dst x86.M512i)

M512SubsetbEpi32: Performs element-by-element subtraction of packed 32-bit integer elements in 'v3' from 'v2', storing the results in 'dst' and the nth borrow bit in the nth position of 'borrow' (borrow flag).

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
	borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSETBD'. Intrinsic: '_mm512_subsetb_epi32'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SwizzleEpi32 ¶

func M512SwizzleEpi32(v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)

M512SwizzleEpi32: Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in 'v' using swizzle parameter 's', storing the results in 'dst'.

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 7
		i := j*64
		dst[i+31:i]    := v[i+63:i+32]
		dst[i+63:i+32] := v[i+31:i]
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+95:i+64]
		dst[i+63:i+32]  := v[i+127:i+96]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+63:i+32]
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+31:i]
		dst[i+63:i+32]  := v[i+31:i]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+31:i]
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+63:i+32]
		dst[i+63:i+32]  := v[i+63:i+32]
		dst[i+95:i+64]  := v[i+63:i+32]
		dst[i+127:i+96] := v[i+63:i+32]
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+95:i+64]
		dst[i+63:i+32]  := v[i+95:i+64]
		dst[i+95:i+64]  := v[i+95:i+64]
		dst[i+127:i+96] := v[i+95:i+64]
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+127:i+96]
		dst[i+63:i+32]  := v[i+127:i+96]
		dst[i+95:i+64]  := v[i+127:i+96]
		dst[i+127:i+96] := v[i+127:i+96]
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]	    := v[i+63:i+32]
		dst[i+63:i+32]  := v[i+95:i+64]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+127:i+96]
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_swizzle_epi32'. Requires KNCNI.

func M512SwizzleEpi64 ¶

func M512SwizzleEpi64(v x86.M512i, s MMSWIZZLEENUM) (dst x86.M512i)

M512SwizzleEpi64: Performs a swizzle transformation of each of the two groups of packed 4x 64-bit integer elements in 'v' using swizzle parameter 's', storing the results in 'dst'.

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 3
		i := j*128
		dst[i+63:i]	    := v[i+127:i+64]
		dst[i+127:i+64] := v[i+63:i]
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+191:i+128]
		dst[i+127:i+64]  := v[i+255:i+192]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+127:i+64]
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+63:i]
		dst[i+127:i+64]  := v[i+63:i]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+63:i]
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]      := v[i+127:i+64]
		dst[i+127:i+64]  := v[i+127:i+64]
		dst[i+191:i+128] := v[i+127:i+64]
		dst[i+255:i+192] := v[i+127:i+64]
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+191:i+128]
		dst[i+127:i+64]  := v[i+191:i+128]
		dst[i+191:i+128] := v[i+191:i+128]
		dst[i+255:i+192] := v[i+191:i+128]
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+255:i+192]
		dst[i+127:i+64]  := v[i+255:i+192]
		dst[i+191:i+128] := v[i+255:i+192]
		dst[i+255:i+192] := v[i+255:i+192]
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+127:i+64]
		dst[i+127:i+64]  := v[i+191:i+128]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+255:i+192]
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_swizzle_epi64'. Requires KNCNI.

func M512SwizzlePd ¶

func M512SwizzlePd(v x86.M512d, s MMSWIZZLEENUM) (dst x86.M512d)

M512SwizzlePd: Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in 'v' using swizzle parameter 's', storing the results in 'dst'.

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 3
		i := j*128
		dst[i+63:i]     := v[i+127:i+64]
		dst[i+127:i+64] := v[i+63:i]
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]      := v[i+191:i+128]
		dst[i+127:i+64]  := v[i+255:i+192]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+127:i+64]
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]      := v[i+63:i]
		dst[i+127:i+64]  := v[i+63:i]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+63:i]
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]      := v[i+127:i+64]
		dst[i+127:i+64]  := v[i+127:i+64]
		dst[i+191:i+128] := v[i+127:i+64]
		dst[i+255:i+192] := v[i+127:i+64]
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]      := v[i+191:i+128]
		dst[i+127:i+64]  := v[i+191:i+128]
		dst[i+191:i+128] := v[i+191:i+128]
		dst[i+255:i+192] := v[i+191:i+128]
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+255:i+192]
		dst[i+127:i+64]  := v[i+255:i+192]
		dst[i+191:i+128] := v[i+255:i+192]
		dst[i+255:i+192] := v[i+255:i+192]
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 1
		i := j*256
		dst[i+63:i]	     := v[i+127:i+64]
		dst[i+127:i+64]  := v[i+191:i+128]
		dst[i+191:i+128] := v[i+63:i]
		dst[i+255:i+192] := v[i+255:i+192]
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_swizzle_pd'. Requires KNCNI.

func M512SwizzlePs ¶

func M512SwizzlePs(v x86.M512, s MMSWIZZLEENUM) (dst x86.M512)

M512SwizzlePs: Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in 'v' using swizzle parameter 's', storing the results in 'dst'.

CASE s OF
_MM_SWIZ_REG_NONE:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
	dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
	FOR j := 0 to 7
		i := j*64
		dst[i+31:i]    := v[i+63:i+32]
		dst[i+63:i+32] := v[i+31:i]
	ENDFOR
_MM_SWIZ_REG_BADC:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+95:i+64]
		dst[i+63:i+32]  := v[i+127:i+96]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+63:i+32]
	ENDFOR
_MM_SWIZ_REG_AAAA:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+31:i]
		dst[i+63:i+32]  := v[i+31:i]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+31:i]
	ENDFOR
_MM_SWIZ_REG_BBBB:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+63:i+32]
		dst[i+63:i+32]  := v[i+63:i+32]
		dst[i+95:i+64]  := v[i+63:i+32]
		dst[i+127:i+96] := v[i+63:i+32]
	ENDFOR
_MM_SWIZ_REG_CCCC:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+95:i+64]
		dst[i+63:i+32]  := v[i+95:i+64]
		dst[i+95:i+64]  := v[i+95:i+64]
		dst[i+127:i+96] := v[i+95:i+64]
	ENDFOR
_MM_SWIZ_REG_DDDD:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+127:i+96]
		dst[i+63:i+32]  := v[i+127:i+96]
		dst[i+95:i+64]  := v[i+127:i+96]
		dst[i+127:i+96] := v[i+127:i+96]
	ENDFOR
_MM_SWIZ_REG_DACB:
	FOR j := 0 to 3
		i := j*128
		dst[i+31:i]     := v[i+63:i+32]
		dst[i+63:i+32]  := v[i+95:i+64]
		dst[i+95:i+64]  := v[i+31:i]
		dst[i+127:i+96] := v[i+127:i+96]
	ENDFOR
ESAC
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_swizzle_ps'. Requires KNCNI.

func M512TestEpi32Mask ¶

func M512TestEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (returned as 'dst') if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm512_test_epi32_mask'. Requires KNCNI.

func M512XorEpi32 ¶

func M512XorEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512XorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm512_xor_epi32'. Requires KNCNI.

func M512XorEpi64 ¶

func M512XorEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512XorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_xor_epi64'. Requires KNCNI.

func M512XorSi512 ¶

func M512XorSi512(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512XorSi512: Compute the bitwise XOR of 512 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[511:0] := (a[511:0] XOR b[511:0])
dst[MAX:512] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm512_xor_si512'. Requires KNCNI.

func Prefetch ¶

func Prefetch(p *byte, i int)

Prefetch: Fetch the line of data from memory that contains address 'p' to a location in the cache hierarchy specified by the locality hint 'i'.

Instruction: 'VPREFETCH0, VPREFETCH1, VPREFETCH2, VPREFETCHNTA, VPREFETCHE0, VPREFETCHE1, VPREFETCHE2, VPREFETCHENTA'. Intrinsic: '_mm_prefetch'. Requires KNCNI.

FIXME: Will likely need to be reworked (has pointer parameter).

func Spflt32 ¶

func Spflt32(r1 uint32)

Spflt32: Set performance monitoring filtering mask to 32-bit unsigned integer 'r1'.

SetPerfMonMask(r1[31:0])

Instruction: 'SPFLT'. Intrinsic: '_mm_spflt_32'. Requires KNCNI.

func Spflt64 ¶

func Spflt64(r1 uint64)

Spflt64: Set performance monitoring filtering mask to 64-bit unsigned integer 'r1'.

SetPerfMonMask(r1[63:0])

Instruction: 'SPFLT'. Intrinsic: '_mm_spflt_64'. Requires KNCNI.

func Tzcnti32 ¶

func Tzcnti32(a int, x uint32) int

Tzcnti32: Counts the number of trailing bits in unsigned 32-bit integer 'x' starting at bit 'a' storing the result in 'dst'.

count := 0
FOR j := a to 31
	IF NOT(x[j] == 1)
		count := count + 1
	FI
ENDFOR
dst := count

Instruction: 'TZCNTI'. Intrinsic: '_mm_tzcnti_32'. Requires KNCNI.

func Tzcnti64 ¶

func Tzcnti64(a int64, x uint64) int64

Tzcnti64: Counts the number of trailing bits in unsigned 64-bit integer 'x' starting at bit 'a' storing the result in 'dst'.

count := 0
FOR j := a to 63
	IF NOT(x[j] == 1)
		count := count + 1
	FI
ENDFOR
dst := count

Instruction: 'TZCNTI'. Intrinsic: '_mm_tzcnti_64'. Requires KNCNI.

Types ¶

This section is empty.

Source Files ¶

View all Source files

kncni.go

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL