avx512f

package v0.0.0-...-3878f85
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AbsEpi64

func AbsEpi64(a x86.M128i) (dst x86.M128i)

AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_abs_epi64'. Requires AVX512F.
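
Since the package ships no working intrinsics (see the warning above), a plain-Go sketch can serve as a reference model for the pseudocode. Here [2]int64 stands in for x86.M128i, and absEpi64Ref is a name local to this sketch:

    package main

    import "fmt"

    // absEpi64Ref mirrors the AbsEpi64 pseudocode: the absolute value of
    // each 64-bit lane. Like VPABSQ, negating math.MinInt64 wraps back to
    // math.MinInt64 in two's complement.
    func absEpi64Ref(a [2]int64) (dst [2]int64) {
    	for j := 0; j <= 1; j++ {
    		if a[j] < 0 {
    			dst[j] = -a[j]
    		} else {
    			dst[j] = a[j]
    		}
    	}
    	return dst
    }

    func main() {
    	fmt.Println(absEpi64Ref([2]int64{-5, 7})) // [5 7]
    }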

func AddRoundSd

func AddRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

AddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] + b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_add_round_sd'. Requires AVX512F.

func AddRoundSs

func AddRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

AddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] + b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_add_round_ss'. Requires AVX512F.
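
A minimal sketch of the lane layout shared by AddRoundSd and AddRoundSs, with [2]float64 standing in for x86.M128d; the 'rounding' argument is not modelled, since ordinary Go arithmetic always rounds to nearest:

    package main

    import "fmt"

    // addRoundSdRef mirrors the AddRoundSd pseudocode minus rounding
    // control: lane 0 is a[0]+b[0], lane 1 is copied from 'a'.
    func addRoundSdRef(a, b [2]float64) (dst [2]float64) {
    	dst[0] = a[0] + b[0]
    	dst[1] = a[1]
    	return dst
    }

    func main() {
    	fmt.Println(addRoundSdRef([2]float64{1.5, 9}, [2]float64{2.25, -1})) // [3.75 9]
    }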

func CmpEpi32Mask

func CmpEpi32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
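
The CASE table above selects one of eight integer predicates. A plain-Go sketch of that dispatch, with [4]int32 standing in for x86.M128i and the mask returned as a uint8 (bit j = lane j); the constant names are local stand-ins for the _MM_CMPINT_* values:

    package main

    import "fmt"

    // Stand-ins for the _MM_CMPINT_* encodings in the table above.
    const (
    	cmpintEQ    = 0
    	cmpintLT    = 1
    	cmpintLE    = 2
    	cmpintFALSE = 3
    	cmpintNEQ   = 4
    	cmpintNLT   = 5
    	cmpintNLE   = 6
    	cmpintTRUE  = 7
    )

    // cmpEpi32MaskRef mirrors the CmpEpi32Mask pseudocode: apply the
    // predicate selected by imm8 to each 32-bit lane and set one mask
    // bit per lane. Encodings above 7 leave the bit clear here.
    func cmpEpi32MaskRef(a, b [4]int32, imm8 byte) (k uint8) {
    	for j := 0; j <= 3; j++ {
    		var hit bool
    		switch imm8 {
    		case cmpintEQ:
    			hit = a[j] == b[j]
    		case cmpintLT:
    			hit = a[j] < b[j]
    		case cmpintLE:
    			hit = a[j] <= b[j]
    		case cmpintFALSE:
    			hit = false
    		case cmpintNEQ:
    			hit = a[j] != b[j]
    		case cmpintNLT:
    			hit = a[j] >= b[j]
    		case cmpintNLE:
    			hit = a[j] > b[j]
    		case cmpintTRUE:
    			hit = true
    		}
    		if hit {
    			k |= 1 << uint(j)
    		}
    	}
    	return k
    }

    func main() {
    	a := [4]int32{1, 2, 3, 4}
    	b := [4]int32{1, 9, 3, 0}
    	fmt.Printf("%04b\n", cmpEpi32MaskRef(a, b, cmpintEQ)) // 0101: lanes 0 and 2 are equal
    }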

func CmpEpi64Mask

func CmpEpi64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpEpu32Mask

func CmpEpu32Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpEpu64Mask

func CmpEpu64Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpPdMask

func CmpPdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
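
The O/U suffixes in the table distinguish ordered from unordered predicates, which differ only when an input is NaN. A scalar float64 sketch of two entries; both function names are local to this sketch:

    package main

    import (
    	"fmt"
    	"math"
    )

    // cmpEqOQ models _CMP_EQ_OQ: equal, ordered; false if either input is NaN.
    func cmpEqOQ(x, y float64) bool {
    	return !math.IsNaN(x) && !math.IsNaN(y) && x == y
    }

    // cmpNeqUQ models _CMP_NEQ_UQ: not-equal, unordered; true if either input is NaN.
    func cmpNeqUQ(x, y float64) bool {
    	return math.IsNaN(x) || math.IsNaN(y) || x != y
    }

    func main() {
    	nan := math.NaN()
    	fmt.Println(cmpEqOQ(nan, nan))  // false: NaN is unordered
    	fmt.Println(cmpNeqUQ(nan, nan)) // true
    	fmt.Println(cmpEqOQ(2, 2))      // true
    }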

func CmpPsMask

func CmpPsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpRoundSdMask

func CmpRoundSdMask(a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)

CmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0

	k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_round_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpRoundSsMask

func CmpRoundSsMask(a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)

CmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0

	k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_round_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpSdMask

func CmpSdMask(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

CmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0

k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_cmp_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpSsMask

func CmpSsMask(a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

CmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0

k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_cmp_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CmpeqEpi32Mask

func CmpeqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpeq_epi32_mask'. Requires AVX512F.

func CmpeqEpi64Mask

func CmpeqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpeq_epi64_mask'. Requires AVX512F.

func CmpeqEpu32Mask

func CmpeqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpeq_epu32_mask'. Requires AVX512F.

func CmpeqEpu64Mask

func CmpeqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpeq_epu64_mask'. Requires AVX512F.

func CmpgeEpi32Mask

func CmpgeEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpge_epi32_mask'. Requires AVX512F.

func CmpgeEpi64Mask

func CmpgeEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpge_epi64_mask'. Requires AVX512F.

func CmpgeEpu32Mask

func CmpgeEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpge_epu32_mask'. Requires AVX512F.

func CmpgeEpu64Mask

func CmpgeEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpge_epu64_mask'. Requires AVX512F.

func CmpgtEpi32Mask

func CmpgtEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpgt_epi32_mask'. Requires AVX512F.

func CmpgtEpi64Mask

func CmpgtEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpgt_epi64_mask'. Requires AVX512F.

func CmpgtEpu32Mask

func CmpgtEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpgt_epu32_mask'. Requires AVX512F.
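
The epu variants differ from their epi counterparts only in reading each lane as unsigned. A one-lane sketch of how the same bit pattern compares under CmpgtEpi32Mask (signed) versus CmpgtEpu32Mask (unsigned):

    package main

    import "fmt"

    func main() {
    	// The same lane bits, read two ways.
    	var bits uint32 = 0xFFFFFFFF // as int32: -1; as uint32: 4294967295

    	signedGT := int32(bits) > 1 // VPCMPD view:  -1 > 1  → false
    	unsignedGT := bits > 1      // VPCMPUD view: max > 1 → true

    	fmt.Println(signedGT, unsignedGT) // false true
    }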

func CmpgtEpu64Mask

func CmpgtEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpgt_epu64_mask'. Requires AVX512F.

func CmpleEpi32Mask

func CmpleEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmple_epi32_mask'. Requires AVX512F.

func CmpleEpi64Mask

func CmpleEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmple_epi64_mask'. Requires AVX512F.

func CmpleEpu32Mask

func CmpleEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmple_epu32_mask'. Requires AVX512F.

func CmpleEpu64Mask

func CmpleEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmple_epu64_mask'. Requires AVX512F.

func CmpltEpi32Mask

func CmpltEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmplt_epi32_mask'. Requires AVX512F.

func CmpltEpi64Mask

func CmpltEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmplt_epi64_mask'. Requires AVX512F.

func CmpltEpu32Mask

func CmpltEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmplt_epu32_mask'. Requires AVX512F.

func CmpltEpu64Mask

func CmpltEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmplt_epu64_mask'. Requires AVX512F.

func CmpneqEpi32Mask

func CmpneqEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_cmpneq_epi32_mask'. Requires AVX512F.

func CmpneqEpi64Mask

func CmpneqEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_cmpneq_epi64_mask'. Requires AVX512F.

func CmpneqEpu32Mask

func CmpneqEpu32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_cmpneq_epu32_mask'. Requires AVX512F.

func CmpneqEpu64Mask

func CmpneqEpu64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 1
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_cmpneq_epu64_mask'. Requires AVX512F.

func ComiRoundSd

func ComiRoundSd(a x86.M128d, b x86.M128d, imm8 byte, sae int) int

ComiRoundSd: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0

Instruction: 'VCOMISD'. Intrinsic: '_mm_comi_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
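
Unlike the mask variants, ComiRoundSd returns the outcome as an int. A scalar sketch with imm8 fixed to the ordered less-than predicate (_CMP_LT_OS); comiLtRef is a name local to this sketch:

    package main

    import (
    	"fmt"
    	"math"
    )

    // comiLtRef models ComiRoundSd with imm8 = _CMP_LT_OS on the lower
    // lane: the boolean result as 0 or 1. NaN inputs are unordered, so
    // an ordered predicate yields 0.
    func comiLtRef(a, b float64) int {
    	if !math.IsNaN(a) && !math.IsNaN(b) && a < b {
    		return 1
    	}
    	return 0
    }

    func main() {
    	fmt.Println(comiLtRef(1, 2))          // 1
    	fmt.Println(comiLtRef(math.NaN(), 2)) // 0
    }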

func ComiRoundSs

func ComiRoundSs(a x86.M128, b x86.M128, imm8 byte, sae int) int

ComiRoundSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and return the boolean result (0 or 1).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0

Instruction: 'VCOMISS'. Intrinsic: '_mm_comi_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func CvtRoundi32Ss

func CvtRoundi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)

CvtRoundi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi32_ss'. Requires AVX512F.

func CvtRoundi64Sd

func CvtRoundi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)

CvtRoundi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_Int64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundi64_sd'. Requires AVX512F.

func CvtRoundi64Ss

func CvtRoundi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)

CvtRoundi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundi64_ss'. Requires AVX512F.

func CvtRoundsdI32

func CvtRoundsdI32(a x86.M128d, rounding int) int

CvtRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i32'. Requires AVX512F.
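
Go has no per-operation rounding control, so the explicit _MM_FROUND_TO_* modes can only be approximated with math functions. A sketch for the conversion step under that assumption; the mode constants are local stand-ins and out-of-range inputs are not handled:

    package main

    import (
    	"fmt"
    	"math"
    )

    // Local stand-ins for the _MM_FROUND_TO_* constants listed above.
    const (
    	toNearest = iota
    	toNegInf
    	toPosInf
    	toZero
    )

    // cvtRoundsdI32Ref models CvtRoundsdI32 on a scalar float64:
    // convert to int32 under an explicit rounding mode.
    func cvtRoundsdI32Ref(a float64, rounding int) int32 {
    	var r float64
    	switch rounding {
    	case toNearest:
    		r = math.RoundToEven(a) // nearest, ties to even: the hardware default
    	case toNegInf:
    		r = math.Floor(a)
    	case toPosInf:
    		r = math.Ceil(a)
    	case toZero:
    		r = math.Trunc(a)
    	}
    	return int32(r)
    }

    func main() {
    	fmt.Println(cvtRoundsdI32Ref(2.5, toNearest)) // 2 (ties to even)
    	fmt.Println(cvtRoundsdI32Ref(2.5, toNegInf))  // 2
    	fmt.Println(cvtRoundsdI32Ref(2.5, toPosInf))  // 3
    	fmt.Println(cvtRoundsdI32Ref(-2.5, toZero))   // -2
    }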

func CvtRoundsdI64

func CvtRoundsdI64(a x86.M128d, rounding int) int64

CvtRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_i64'. Requires AVX512F.

func CvtRoundsdSi32

func CvtRoundsdSi32(a x86.M128d, rounding int) int

CvtRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si32'. Requires AVX512F.

func CvtRoundsdSi64

func CvtRoundsdSi64(a x86.M128d, rounding int) int64

CvtRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvt_roundsd_si64'. Requires AVX512F.

func CvtRoundsdSs

func CvtRoundsdSs(a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

CvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_cvt_roundsd_ss'. Requires AVX512F.
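
A sketch of the lane layout, with [4]float32 standing in for x86.M128 and [2]float64 for x86.M128d; the 'rounding' argument is not modelled, since Go narrows float64 to float32 with round-to-nearest only:

    package main

    import "fmt"

    // cvtRoundsdSsRef mirrors the CvtRoundsdSs lane layout: lane 0 is
    // the double narrowed to float32, lanes 1-3 come from 'a'.
    func cvtRoundsdSsRef(a [4]float32, b [2]float64) (dst [4]float32) {
    	dst[0] = float32(b[0])
    	dst[1], dst[2], dst[3] = a[1], a[2], a[3]
    	return dst
    }

    func main() {
    	fmt.Println(cvtRoundsdSsRef([4]float32{0, 1, 2, 3}, [2]float64{0.1, 9}))
    	// [0.1 1 2 3]: lane 0 is the nearest float32 to 0.1
    }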

func CvtRoundsdU32

func CvtRoundsdU32(a x86.M128d, rounding int) uint32

CvtRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u32'. Requires AVX512F.

func CvtRoundsdU64

func CvtRoundsdU64(a x86.M128d, rounding int) uint64

CvtRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvt_roundsd_u64'. Requires AVX512F.

func CvtRoundsi32Ss

func CvtRoundsi32Ss(a x86.M128, b int, rounding int) (dst x86.M128)

CvtRoundsi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi32_ss'. Requires AVX512F.

func CvtRoundsi64Sd

func CvtRoundsi64Sd(a x86.M128d, b int64, rounding int) (dst x86.M128d)

CvtRoundsi64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_Int64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvt_roundsi64_sd'. Requires AVX512F.

func CvtRoundsi64Ss

func CvtRoundsi64Ss(a x86.M128, b int64, rounding int) (dst x86.M128)

CvtRoundsi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_Int64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvt_roundsi64_ss'. Requires AVX512F.

func CvtRoundssI32

func CvtRoundssI32(a x86.M128, rounding int) int

CvtRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i32'. Requires AVX512F.

func CvtRoundssI64

func CvtRoundssI64(a x86.M128, rounding int) int64

CvtRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_i64'. Requires AVX512F.

func CvtRoundssSd

func CvtRoundssSd(a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

CvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_cvt_roundss_sd'. Requires AVX512F.

func CvtRoundssSi32

func CvtRoundssSi32(a x86.M128, rounding int) int

CvtRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si32'. Requires AVX512F.

func CvtRoundssSi64

func CvtRoundssSi64(a x86.M128, rounding int) int64

CvtRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvt_roundss_si64'. Requires AVX512F.

func CvtRoundssU32

func CvtRoundssU32(a x86.M128, rounding int) uint32

CvtRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u32'. Requires AVX512F.

func CvtRoundssU64

func CvtRoundssU64(a x86.M128, rounding int) uint64

CvtRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvt_roundss_u64'. Requires AVX512F.

func CvtRoundu32Ss

func CvtRoundu32Ss(a x86.M128, b uint32, rounding int) (dst x86.M128)

CvtRoundu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu32_ss'. Requires AVX512F.

func CvtRoundu64Sd

func CvtRoundu64Sd(a x86.M128d, b uint64, rounding int) (dst x86.M128d)

CvtRoundu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvt_roundu64_sd'. Requires AVX512F.

func CvtRoundu64Ss

func CvtRoundu64Ss(a x86.M128, b uint64, rounding int) (dst x86.M128)

CvtRoundu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvt_roundu64_ss'. Requires AVX512F.

func Cvtepi32Epi16

func Cvtepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_cvtepi32_epi16'. Requires AVX512F.
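
A plain-Go model of the truncating narrow: each lane keeps only its low 16 bits, which is exactly what Go's int16 conversion does. Compare the saturating Cvtsepi32Epi16 further below:

    package main

    import "fmt"

    // cvtepi32Epi16Ref mirrors Truncate_Int32_To_Int16: narrow each
    // 32-bit lane by discarding the high 16 bits.
    func cvtepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    	for j := 0; j <= 3; j++ {
    		dst[j] = int16(a[j]) // keeps only the low 16 bits
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtepi32Epi16Ref([4]int32{1, -1, 70000, -70000}))
    	// [1 -1 4464 -4464]: 70000 does not fit in int16 and wraps
    }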

func Cvtepi32Epi8

func Cvtepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_cvtepi32_epi8'. Requires AVX512F.

func Cvtepi64Epi16

func Cvtepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_cvtepi64_epi16'. Requires AVX512F.

func Cvtepi64Epi32

func Cvtepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_cvtepi64_epi32'. Requires AVX512F.

func Cvtepi64Epi8

func Cvtepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_cvtepi64_epi8'. Requires AVX512F.

func Cvtepu32Pd

func Cvtepu32Pd(a x86.M128i) (dst x86.M128d)

Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_cvtepu32_pd'. Requires AVX512F.
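
A sketch of the unsigned widen, with [4]uint32 standing in for x86.M128i (only the two low lanes are read) and [2]float64 for x86.M128d:

    package main

    import "fmt"

    // cvtepu32PdRef mirrors Cvtepu32Pd: the two low 32-bit lanes, read
    // as unsigned integers, are widened to float64. Every uint32 value
    // is exactly representable in float64.
    func cvtepu32PdRef(a [4]uint32) (dst [2]float64) {
    	for j := 0; j <= 1; j++ {
    		dst[j] = float64(a[j])
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtepu32PdRef([4]uint32{0xFFFFFFFF, 1, 0, 0}))
    	// [4.294967295e+09 1]: 0xFFFFFFFF stays 4294967295, not -1
    }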

func Cvti32Sd

func Cvti32Sd(a x86.M128d, b int) (dst x86.M128d)

Cvti32Sd: Convert the 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti32_sd'. Requires AVX512F.

func Cvti32Ss

func Cvti32Ss(a x86.M128, b int) (dst x86.M128)

Cvti32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti32_ss'. Requires AVX512F.

func Cvti64Sd

func Cvti64Sd(a x86.M128d, b int64) (dst x86.M128d)

Cvti64Sd: Convert the 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SD'. Intrinsic: '_mm_cvti64_sd'. Requires AVX512F.

func Cvti64Ss

func Cvti64Ss(a x86.M128, b int64) (dst x86.M128)

Cvti64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSI2SS'. Intrinsic: '_mm_cvti64_ss'. Requires AVX512F.

func CvtpdEpu32

func CvtpdEpu32(a x86.M128d) (dst x86.M128i)

CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_cvtpd_epu32'. Requires AVX512F.

func CvtpsEpu32

func CvtpsEpu32(a x86.M128) (dst x86.M128i)

CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_cvtps_epu32'. Requires AVX512F.

func CvtsdI32

func CvtsdI32(a x86.M128d) int

CvtsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_Int32(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i32'. Requires AVX512F.

func CvtsdI64

func CvtsdI64(a x86.M128d) int64

CvtsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_Int64(a[63:0])

Instruction: 'VCVTSD2SI'. Intrinsic: '_mm_cvtsd_i64'. Requires AVX512F.

func CvtsdU32

func CvtsdU32(a x86.M128d) uint32

CvtsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u32'. Requires AVX512F.

func CvtsdU64

func CvtsdU64(a x86.M128d) uint64

CvtsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])

Instruction: 'VCVTSD2USI'. Intrinsic: '_mm_cvtsd_u64'. Requires AVX512F.

func Cvtsepi32Epi16

func Cvtsepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_cvtsepi32_epi16'. Requires AVX512F.
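
In contrast to the truncating Cvtepi32Epi16 above, the saturating form clamps out-of-range lanes to the int16 range. A plain-Go model:

    package main

    import (
    	"fmt"
    	"math"
    )

    // cvtsepi32Epi16Ref mirrors Saturate_Int32_To_Int16: clamp each
    // 32-bit lane to [math.MinInt16, math.MaxInt16] before narrowing.
    func cvtsepi32Epi16Ref(a [4]int32) (dst [4]int16) {
    	for j := 0; j <= 3; j++ {
    		v := a[j]
    		switch {
    		case v > math.MaxInt16:
    			v = math.MaxInt16
    		case v < math.MinInt16:
    			v = math.MinInt16
    		}
    		dst[j] = int16(v)
    	}
    	return dst
    }

    func main() {
    	fmt.Println(cvtsepi32Epi16Ref([4]int32{1, -1, 70000, -70000}))
    	// [1 -1 32767 -32768]: out-of-range lanes clamp instead of wrapping
    }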

func Cvtsepi32Epi8

func Cvtsepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_cvtsepi32_epi8'. Requires AVX512F.

func Cvtsepi64Epi16

func Cvtsepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_cvtsepi64_epi16'. Requires AVX512F.

func Cvtsepi64Epi32

func Cvtsepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_cvtsepi64_epi32'. Requires AVX512F.

func Cvtsepi64Epi8

func Cvtsepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_cvtsepi64_epi8'. Requires AVX512F.

func CvtssI32

func CvtssI32(a x86.M128) int

CvtssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i32'. Requires AVX512F.

func CvtssI64

func CvtssI64(a x86.M128) int64

CvtssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'VCVTSS2SI'. Intrinsic: '_mm_cvtss_i64'. Requires AVX512F.

func CvtssU32

func CvtssU32(a x86.M128) uint32

CvtssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u32'. Requires AVX512F.

func CvtssU64

func CvtssU64(a x86.M128) uint64

CvtssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])

Instruction: 'VCVTSS2USI'. Intrinsic: '_mm_cvtss_u64'. Requires AVX512F.

func CvttRoundsdI32

func CvttRoundsdI32(a x86.M128d, rounding int) int

CvttRoundsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i32'. Requires AVX512F.
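
Go's float-to-int conversion already truncates toward zero, matching Convert_FP64_To_Int32_Truncate; VCVTTSD2SI truncates regardless of the rounding argument, so this sketch models only the conversion itself:

    package main

    import "fmt"

    // cvttRoundsdI32Ref mirrors the CvttRoundsdI32 conversion step:
    // Go's int32(float64) conversion truncates toward zero.
    func cvttRoundsdI32Ref(a float64) int32 {
    	return int32(a)
    }

    func main() {
    	fmt.Println(cvttRoundsdI32Ref(2.9))  // 2
    	fmt.Println(cvttRoundsdI32Ref(-2.9)) // -2
    }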

func CvttRoundsdI64

func CvttRoundsdI64(a x86.M128d, rounding int) int64

CvttRoundsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_i64'. Requires AVX512F.

func CvttRoundsdSi32

func CvttRoundsdSi32(a x86.M128d, rounding int) int

CvttRoundsdSi32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si32'. Requires AVX512F.

func CvttRoundsdSi64

func CvttRoundsdSi64(a x86.M128d, rounding int) int64

CvttRoundsdSi64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvtt_roundsd_si64'. Requires AVX512F.

func CvttRoundsdU32

func CvttRoundsdU32(a x86.M128d, rounding int) uint32

CvttRoundsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u32'. Requires AVX512F.

func CvttRoundsdU64

func CvttRoundsdU64(a x86.M128d, rounding int) uint64

CvttRoundsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvtt_roundsd_u64'. Requires AVX512F.

func CvttRoundssI32

func CvttRoundssI32(a x86.M128, rounding int) int

CvttRoundssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i32'. Requires AVX512F.

func CvttRoundssI64

func CvttRoundssI64(a x86.M128, rounding int) int64

CvttRoundssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_i64'. Requires AVX512F.

func CvttRoundssSi32

func CvttRoundssSi32(a x86.M128, rounding int) int

CvttRoundssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si32'. Requires AVX512F.

func CvttRoundssSi64

func CvttRoundssSi64(a x86.M128, rounding int) int64

CvttRoundssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvtt_roundss_si64'. Requires AVX512F.

func CvttRoundssU32

func CvttRoundssU32(a x86.M128, rounding int) uint32

CvttRoundssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u32'. Requires AVX512F.

func CvttRoundssU64

func CvttRoundssU64(a x86.M128, rounding int) uint64

CvttRoundssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvtt_roundss_u64'. Requires AVX512F.

func CvttpdEpu32

func CvttpdEpu32(a x86.M128d) (dst x86.M128i)

CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_cvttpd_epu32'. Requires AVX512F.

func CvttpsEpu32

func CvttpsEpu32(a x86.M128) (dst x86.M128i)

CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_cvttps_epu32'. Requires AVX512F.

func CvttsdI32

func CvttsdI32(a x86.M128d) int

CvttsdI32: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i32'. Requires AVX512F.
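
Go's numeric conversion already truncates toward zero, so a scalar model is direct (hedged: for out-of-range inputs the instruction returns the integer indefinite value, while Go's result is implementation-defined):

	// cvttsdI32 models VCVTTSD2SI for in-range inputs: Go's
	// float-to-int conversion discards the fractional part.
	func cvttsdI32(a float64) int32 {
		return int32(a)
	}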

func CvttsdI64

func CvttsdI64(a x86.M128d) int64

CvttsdI64: Convert the lower double-precision (64-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])

Instruction: 'VCVTTSD2SI'. Intrinsic: '_mm_cvttsd_i64'. Requires AVX512F.

func CvttsdU32

func CvttsdU32(a x86.M128d) uint32

CvttsdU32: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u32'. Requires AVX512F.

func CvttsdU64

func CvttsdU64(a x86.M128d) uint64

CvttsdU64: Convert the lower double-precision (64-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])

Instruction: 'VCVTTSD2USI'. Intrinsic: '_mm_cvttsd_u64'. Requires AVX512F.

func CvttssI32

func CvttssI32(a x86.M128) int

CvttssI32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i32'. Requires AVX512F.

func CvttssI64

func CvttssI64(a x86.M128) int64

CvttssI64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'VCVTTSS2SI'. Intrinsic: '_mm_cvttss_i64'. Requires AVX512F.

func CvttssU32

func CvttssU32(a x86.M128) uint32

CvttssU32: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u32'. Requires AVX512F.

func CvttssU64

func CvttssU64(a x86.M128) uint64

CvttssU64: Convert the lower single-precision (32-bit) floating-point element in 'a' to an unsigned 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])

Instruction: 'VCVTTSS2USI'. Intrinsic: '_mm_cvttss_u64'. Requires AVX512F.

func Cvtu32Sd

func Cvtu32Sd(a x86.M128d, b uint32) (dst x86.M128d)

Cvtu32Sd: Convert the unsigned 32-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu32_sd'. Requires AVX512F.

func Cvtu32Ss

func Cvtu32Ss(a x86.M128, b uint32) (dst x86.M128)

Cvtu32Ss: Convert the unsigned 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu32_ss'. Requires AVX512F.

func Cvtu64Sd

func Cvtu64Sd(a x86.M128d, b uint64) (dst x86.M128d)

Cvtu64Sd: Convert the unsigned 64-bit integer 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SD'. Intrinsic: '_mm_cvtu64_sd'. Requires AVX512F.

func Cvtu64Ss

func Cvtu64Ss(a x86.M128, b uint64) (dst x86.M128)

Cvtu64Ss: Convert the unsigned 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTUSI2SS'. Intrinsic: '_mm_cvtu64_ss'. Requires AVX512F.
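
The low-lane behavior matches Go's own unsigned-to-float conversion, assuming the default round-to-nearest-even mode; a sketch with hypothetical array types standing in for the vectors:

	// cvtu64Ss models VCVTUSI2SS as a 4-lane operation: the low lane is
	// the rounded conversion of b, the upper three lanes come from a.
	func cvtu64Ss(a [4]float32, b uint64) [4]float32 {
		dst := a
		dst[0] = float32(b) // large values round to the nearest float32
		return dst
	}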

func Cvtusepi32Epi16

func Cvtusepi32Epi16(a x86.M128i) (dst x86.M128i)

Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_cvtusepi32_epi16'. Requires AVX512F.
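
The unsigned-saturation step in scalar Go (a hypothetical helper, not part of this package):

	// saturateUint32ToUint16 clamps v to the uint16 range, mirroring
	// the Saturate_UnsignedInt32_To_Int16 pseudocode above.
	func saturateUint32ToUint16(v uint32) uint16 {
		if v > 0xFFFF {
			return 0xFFFF
		}
		return uint16(v)
	}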

func Cvtusepi32Epi8

func Cvtusepi32Epi8(a x86.M128i) (dst x86.M128i)

Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_cvtusepi32_epi8'. Requires AVX512F.

func Cvtusepi64Epi16

func Cvtusepi64Epi16(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_cvtusepi64_epi16'. Requires AVX512F.

func Cvtusepi64Epi32

func Cvtusepi64Epi32(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_cvtusepi64_epi32'. Requires AVX512F.

func Cvtusepi64Epi8

func Cvtusepi64Epi8(a x86.M128i) (dst x86.M128i)

Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_cvtusepi64_epi8'. Requires AVX512F.

func DivRoundSd

func DivRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

DivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] / b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_div_round_sd'. Requires AVX512F.

func DivRoundSs

func DivRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

DivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] / b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_div_round_ss'. Requires AVX512F.

func FixupimmPd

func FixupimmPd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
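
The pseudocode is dense, so here is a minimal scalar Go sketch of its two lookup steps: classifying the source into a token, then extracting that token's 4-bit response from the table word. It is an illustration only (hypothetical helpers, assuming import "math"); SNaN detection, DAZ handling and the exception flags are omitted.

	// classifyToken maps x to the TOKEN_TYPE index above. All NaNs are
	// treated as QNAN_TOKEN here; separating SNaN requires inspecting
	// the mantissa MSB, which this sketch skips.
	func classifyToken(x float64) uint {
		switch {
		case math.IsNaN(x):
			return 0 // QNAN_TOKEN
		case x == 0:
			return 2 // ZERO_VALUE_TOKEN
		case x == 1:
			return 3 // ONE_VALUE_TOKEN
		case math.IsInf(x, -1):
			return 4 // NEG_INF_TOKEN
		case math.IsInf(x, 1):
			return 5 // POS_INF_TOKEN
		case x < 0:
			return 6 // NEG_VALUE_TOKEN
		default:
			return 7 // POS_VALUE_TOKEN
		}
	}

	// tokenResponse extracts src3[3+4*j:4*j], the 4-bit action code
	// selected by token j.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}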

func FixupimmPs

func FixupimmPs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmRoundSd

func FixupimmRoundSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

FixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign ? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmRoundSs

func FixupimmRoundSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

FixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign ? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmSd

func FixupimmSd(a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

FixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func FixupimmSs

func FixupimmSs(a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

FixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func GetexpPd

func GetexpPd(a x86.M128d) (dst x86.M128d)

GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_getexp_pd'. Requires AVX512F.
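
In Go the same per-element value is available as math.Logb, which also agrees on the special cases (Logb(±0) = -Inf, Logb(±Inf) = +Inf, Logb(NaN) = NaN); a one-line sketch, assuming import "math":

	// getexpPd models VGETEXPPD on a single element: the unbiased
	// binary exponent of a, i.e. floor(log2(|a|)), as a float64.
	func getexpPd(a float64) float64 {
		return math.Logb(a)
	}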

func GetexpPs

func GetexpPs(a x86.M128) (dst x86.M128)

GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_getexp_ps'. Requires AVX512F.

func GetexpRoundSd

func GetexpRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

GetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := ConvertExpFP64(b[63:0])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_round_sd'. Requires AVX512F.

func GetexpRoundSs

func GetexpRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

GetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := ConvertExpFP32(b[31:0])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_round_ss'. Requires AVX512F.

func GetexpSd

func GetexpSd(a x86.M128d, b x86.M128d) (dst x86.M128d)

GetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_getexp_sd'. Requires AVX512F.

func GetexpSs

func GetexpSs(a x86.M128, b x86.M128) (dst x86.M128)

GetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_getexp_ss'. Requires AVX512F.

func GetmantPd

func GetmantPd(a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_getmant_pd'. Requires AVX512F.
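
For the common _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src combination, math.Frexp yields the same normalization. A sketch under those assumptions (other interv/sc values and the special inputs 0, Inf and NaN are not modeled; assumes import "math"):

	// getmant12 scales a's significand into [1, 2), keeping the source
	// sign: Frexp returns |frac| in [0.5, 1), so doubling lands in [1, 2).
	func getmant12(a float64) float64 {
		frac, _ := math.Frexp(a)
		return frac * 2
	}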

func GetmantPs

func GetmantPs(a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_getmant_ps'. Requires AVX512F.

func GetmantRoundSd

func GetmantRoundSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

GetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_round_sd'. Requires AVX512F.

func GetmantRoundSs

func GetmantRoundSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

GetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_round_ss'. Requires AVX512F.

func GetmantSd

func GetmantSd(a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

GetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_getmant_sd'. Requires AVX512F.

func GetmantSs

func GetmantSs(a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

GetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_getmant_ss'. Requires AVX512F.

func M256AbsEpi64

func M256AbsEpi64(a x86.M256i) (dst x86.M256i)

M256AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_abs_epi64'. Requires AVX512F.
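
A scalar Go model of the loop, including the two's-complement edge case the instruction shares:

	// absEpi64 models VPABSQ over four 64-bit lanes. As with the
	// instruction, the absolute value of math.MinInt64 wraps to itself.
	func absEpi64(a [4]int64) (dst [4]int64) {
		for j, v := range a {
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
		return
	}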

func M256BroadcastF32x4

func M256BroadcastF32x4(a x86.M128) (dst x86.M256)

M256BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_broadcast_f32x4'. Requires AVX512F.
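
Equivalently in scalar Go (a sketch; lane j of the result copies lane j mod 4 of the source):

	// broadcastF32x4 models VBROADCASTF32X4: the 128-bit source is
	// repeated across both 128-bit halves of the 256-bit destination.
	func broadcastF32x4(a [4]float32) (dst [8]float32) {
		for j := range dst {
			dst[j] = a[j%4]
		}
		return
	}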

func M256BroadcastI32x4

func M256BroadcastI32x4(a x86.M128i) (dst x86.M256i)

M256BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_broadcast_i32x4'. Requires AVX512F.

func M256CmpEpi32Mask

func M256CmpEpi32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
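
The mask layout is one result bit per lane. A Go sketch for a single fixed predicate (imm8 = _MM_CMPINT_LT; the other seven predicates only swap the comparison):

	// cmpltEpi32Mask models VPCMPD with the _MM_CMPINT_LT predicate:
	// bit j of the returned mask is set when a[j] < b[j].
	func cmpltEpi32Mask(a, b [8]int32) (k uint8) {
		for j := range a {
			if a[j] < b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}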

func M256CmpEpi64Mask

func M256CmpEpi64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu32Mask

func M256CmpEpu32Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu64Mask

func M256CmpEpu64Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpPdMask

func M256CmpPdMask(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)

M256CmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*64
	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm256_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
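
For the floating-point predicates the ordered/unordered distinction matters once a NaN appears. Go's comparison operators are ordered (false on NaN), so the _CMP_LT_OS case is direct; a sketch:

	// cmpltPdMask models VCMPPD with the _CMP_LT_OS predicate: bit j is
	// set when a[j] < b[j]; a lane containing NaN compares false.
	func cmpltPdMask(a, b [4]float64) (k uint8) {
		for j := range a {
			if a[j] < b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}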

func M256CmpPsMask

func M256CmpPsMask(a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)

M256CmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*32
	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm256_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256CmpeqEpi32Mask

func M256CmpeqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpeq_epi32_mask'. Requires AVX512F.

func M256CmpeqEpi64Mask

func M256CmpeqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpeq_epi64_mask'. Requires AVX512F.

func M256CmpeqEpu32Mask

func M256CmpeqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpeq_epu32_mask'. Requires AVX512F.

func M256CmpeqEpu64Mask

func M256CmpeqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpeq_epu64_mask'. Requires AVX512F.

func M256CmpgeEpi32Mask

func M256CmpgeEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpge_epi32_mask'. Requires AVX512F.

func M256CmpgeEpi64Mask

func M256CmpgeEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpge_epi64_mask'. Requires AVX512F.

func M256CmpgeEpu32Mask

func M256CmpgeEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpge_epu32_mask'. Requires AVX512F.

func M256CmpgeEpu64Mask

func M256CmpgeEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpge_epu64_mask'. Requires AVX512F.

func M256CmpgtEpi32Mask

func M256CmpgtEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpgt_epi32_mask'. Requires AVX512F.

func M256CmpgtEpi64Mask

func M256CmpgtEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpgt_epi64_mask'. Requires AVX512F.

func M256CmpgtEpu32Mask

func M256CmpgtEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpgt_epu32_mask'. Requires AVX512F.

func M256CmpgtEpu64Mask

func M256CmpgtEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpgt_epu64_mask'. Requires AVX512F.

func M256CmpleEpi32Mask

func M256CmpleEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmple_epi32_mask'. Requires AVX512F.

func M256CmpleEpi64Mask

func M256CmpleEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmple_epi64_mask'. Requires AVX512F.

func M256CmpleEpu32Mask

func M256CmpleEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmple_epu32_mask'. Requires AVX512F.

func M256CmpleEpu64Mask

func M256CmpleEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmple_epu64_mask'. Requires AVX512F.

func M256CmpltEpi32Mask

func M256CmpltEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmplt_epi32_mask'. Requires AVX512F.

func M256CmpltEpi64Mask

func M256CmpltEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmplt_epi64_mask'. Requires AVX512F.

func M256CmpltEpu32Mask

func M256CmpltEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmplt_epu32_mask'. Requires AVX512F.

func M256CmpltEpu64Mask

func M256CmpltEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmplt_epu64_mask'. Requires AVX512F.

func M256CmpneqEpi32Mask

func M256CmpneqEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_cmpneq_epi32_mask'. Requires AVX512F.

func M256CmpneqEpi64Mask

func M256CmpneqEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_cmpneq_epi64_mask'. Requires AVX512F.

func M256CmpneqEpu32Mask

func M256CmpneqEpu32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*32
	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_cmpneq_epu32_mask'. Requires AVX512F.

func M256CmpneqEpu64Mask

func M256CmpneqEpu64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 3
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_cmpneq_epu64_mask'. Requires AVX512F.

func M256Cvtepi32Epi16

func M256Cvtepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_cvtepi32_epi16'. Requires AVX512F.
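
Truncation drops the upper bits of each lane, exactly like a narrowing integer conversion in Go; out-of-range values wrap rather than clamp. A hypothetical scalar model:

    // cvtEpi32Epi16 models VPMOVDW: each 32-bit lane keeps only its
    // low 16 bits (Truncate_Int32_To_Int16), so 0x12345678 becomes 0x5678.
    func cvtEpi32Epi16(a [8]int32) (dst [8]int16) {
    	for j, v := range a {
    		dst[j] = int16(v)
    	}
    	return
    }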

func M256Cvtepi32Epi8

func M256Cvtepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_cvtepi32_epi8'. Requires AVX512F.

func M256Cvtepi64Epi16

func M256Cvtepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_cvtepi64_epi16'. Requires AVX512F.

func M256Cvtepi64Epi32

func M256Cvtepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_cvtepi64_epi32'. Requires AVX512F.

func M256Cvtepi64Epi8

func M256Cvtepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_cvtepi64_epi8'. Requires AVX512F.

func M256Cvtepu32Pd

func M256Cvtepu32Pd(a x86.M128i) (dst x86.M256d)

M256Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_cvtepu32_pd'. Requires AVX512F.

func M256CvtpdEpu32

func M256CvtpdEpu32(a x86.M256d) (dst x86.M128i)

M256CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_cvtpd_epu32'. Requires AVX512F.

func M256CvtpsEpu32

func M256CvtpsEpu32(a x86.M256) (dst x86.M256i)

M256CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_cvtps_epu32'. Requires AVX512F.

func M256Cvtsepi32Epi16

func M256Cvtsepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_cvtsepi32_epi16'. Requires AVX512F.
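
In contrast to the truncating VPMOVDW above, the saturating form clamps out-of-range values to the int16 limits. A hypothetical Go model of Saturate_Int32_To_Int16:

    // saturateInt32ToInt16 clamps to [-32768, 32767] instead of wrapping.
    func saturateInt32ToInt16(v int32) int16 {
    	if v > 32767 {
    		return 32767
    	}
    	if v < -32768 {
    		return -32768
    	}
    	return int16(v)
    }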

func M256Cvtsepi32Epi8

func M256Cvtsepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_cvtsepi32_epi8'. Requires AVX512F.

func M256Cvtsepi64Epi16

func M256Cvtsepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_cvtsepi64_epi16'. Requires AVX512F.

func M256Cvtsepi64Epi32

func M256Cvtsepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_cvtsepi64_epi32'. Requires AVX512F.

func M256Cvtsepi64Epi8

func M256Cvtsepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_cvtsepi64_epi8'. Requires AVX512F.

func M256CvttpdEpu32

func M256CvttpdEpu32(a x86.M256d) (dst x86.M128i)

M256CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_cvttpd_epu32'. Requires AVX512F.

func M256CvttpsEpu32

func M256CvttpsEpu32(a x86.M256) (dst x86.M256i)

M256CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_cvttps_epu32'. Requires AVX512F.
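
The only difference from _mm256_cvtps_epu32 above is the rounding step: this form always truncates toward zero instead of using the current MXCSR rounding mode, so 2.7 becomes 2 rather than 3. A hypothetical one-lane Go sketch (the real instruction's out-of-range and NaN behavior is not modeled):

    // cvttF32ToU32 models Convert_FP32_To_UnsignedInt32_Truncate for an
    // in-range lane; Go's float-to-integer conversion truncates toward zero.
    func cvttF32ToU32(x float32) uint32 {
    	return uint32(x)
    }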

func M256Cvtusepi32Epi16

func M256Cvtusepi32Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_cvtusepi32_epi16'. Requires AVX512F.

func M256Cvtusepi32Epi8

func M256Cvtusepi32Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_cvtusepi32_epi8'. Requires AVX512F.

func M256Cvtusepi64Epi16

func M256Cvtusepi64Epi16(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_cvtusepi64_epi16'. Requires AVX512F.

func M256Cvtusepi64Epi32

func M256Cvtusepi64Epi32(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_cvtusepi64_epi32'. Requires AVX512F.

func M256Cvtusepi64Epi8

func M256Cvtusepi64Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_cvtusepi64_epi8'. Requires AVX512F.

func M256Extractf32x4Ps

func M256Extractf32x4Ps(a x86.M256, imm8 byte) (dst x86.M128)

M256Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256Extracti32x4Epi32

func M256Extracti32x4Epi32(a x86.M256i, imm8 byte) (dst x86.M128i)

M256Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256FixupimmPd

func M256FixupimmPd(a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign ? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
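
The core of the fixup operation is a 16-entry response table packed into each element of 'c', indexed by the 3-bit token class j derived from the input. A hypothetical Go sketch of just that field extraction (token_response[3:0] := src3[3+4*j:4*j]):

    // tokenResponse pulls the 4-bit response for token class j (0-7)
    // out of one 64-bit control element of 'c'.
    func tokenResponse(src3 uint64, j uint) uint8 {
    	return uint8(src3>>(4*j)) & 0xF
    }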

func M256FixupimmPs

func M256FixupimmPs(a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign ? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256GetexpPd

func M256GetexpPd(a x86.M256d) (dst x86.M256d)

M256GetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_getexp_pd'. Requires AVX512F.

func M256GetexpPs

func M256GetexpPs(a x86.M256) (dst x86.M256)

M256GetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_getexp_ps'. Requires AVX512F.
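
For a finite, nonzero input the result is floor(log2(|x|)) returned as a floating-point value. A hypothetical Go model using math.Frexp (zero, infinity, and NaN are not handled here):

    import "math"

    // getExp models ConvertExpFP64 for a normal, nonzero input:
    // math.Frexp returns frac in [0.5, 1) with x = frac * 2^exp,
    // so floor(log2(|x|)) is exp - 1.
    func getExp(x float64) float64 {
    	_, exp := math.Frexp(x)
    	return float64(exp - 1)
    }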

func M256GetmantPd

func M256GetmantPd(a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256GetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_getmant_pd'. Requires AVX512F.

func M256GetmantPs

func M256GetmantPs(a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256GetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_getmant_ps'. Requires AVX512F.
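
Normalization rescales the significand into the chosen interval and discards the original exponent. A hypothetical Go model of the _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src case, valid for finite nonzero inputs:

    import "math"

    // getMant12 returns the significand scaled into [1, 2) with the
    // source sign kept, e.g. -12 = -1.5 * 2^3 yields -1.5.
    func getMant12(x float64) float64 {
    	frac, _ := math.Frexp(x) // frac in [0.5, 1), sign preserved
    	return frac * 2          // now in [1, 2)
    }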

func M256Insertf32x4

func M256Insertf32x4(a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256Inserti32x4

func M256Inserti32x4(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
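
Both insert intrinsics treat the 256-bit destination as two 128-bit lanes, with the low bit of 'imm8' selecting which lane 'b' overwrites. A hypothetical Go model on plain arrays:

    // insertI32x4 copies 'a', then overwrites one 4-element lane with 'b';
    // imm8 bit 0 selects elements 0-3 or 4-7.
    func insertI32x4(a [8]int32, b [4]int32, imm8 byte) [8]int32 {
    	dst := a
    	off := int(imm8&1) * 4
    	copy(dst[off:off+4], b[:])
    	return dst
    }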

func M256Mask2Permutex2varEpi32

func M256Mask2Permutex2varEpi32(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)

M256Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm256_mask2_permutex2var_epi32'. Requires AVX512F.

func M256Mask2Permutex2varEpi64

func M256Mask2Permutex2varEpi64(a x86.M256i, idx x86.M256i, k x86.Mmask8, b x86.M256i) (dst x86.M256i)

M256Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm256_mask2_permutex2var_epi64'. Requires AVX512F.

func M256Mask2Permutex2varPd

func M256Mask2Permutex2varPd(a x86.M256d, idx x86.M256i, k x86.Mmask8, b x86.M256d) (dst x86.M256d)

M256Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm256_mask2_permutex2var_pd'. Requires AVX512F.

func M256Mask2Permutex2varPs

func M256Mask2Permutex2varPs(a x86.M256, idx x86.M256i, k x86.Mmask8, b x86.M256) (dst x86.M256)

M256Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm256_mask2_permutex2var_ps'. Requires AVX512F.
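
Each 32-bit index element supplies three selector bits for the element position and one more bit (idx[i+3]; idx[i+2] in the 64-bit variants) to choose between the two sources. A hypothetical Go model of the unmasked selection for one 32-bit lane:

    // permutex2varLane decodes one index: bits 2:0 pick the element,
    // bit 3 picks source 'b' over source 'a'.
    func permutex2varLane(a, b [8]int32, idx int32) int32 {
    	off := idx & 0x7
    	if idx&0x8 != 0 {
    		return b[off]
    	}
    	return a[off]
    }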

func M256Mask3FmaddPd

func M256Mask3FmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask3_fmadd_pd'. Requires AVX512F.
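
In the mask3 variants the fallback source is 'c', the same operand that supplies the addend. A hypothetical Go model using math.FMA, which, like the hardware instruction, rounds only once:

    import "math"

    // mask3FmaddPd: lanes with the mask bit set get fma(a, b, c);
    // the rest keep the corresponding lane of c.
    func mask3FmaddPd(a, b, c [4]float64, k uint8) (dst [4]float64) {
    	for j := 0; j < 4; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = math.FMA(a[j], b[j], c[j])
    		} else {
    			dst[j] = c[j]
    		}
    	}
    	return
    }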

func M256Mask3FmaddPs

func M256Mask3FmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask3_fmadd_ps'. Requires AVX512F.

func M256Mask3FmaddsubPd

func M256Mask3FmaddsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask3_fmaddsub_pd'. Requires AVX512F.

func M256Mask3FmaddsubPs

func M256Mask3FmaddsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask3_fmaddsub_ps'. Requires AVX512F.

func M256Mask3FmsubPd

func M256Mask3FmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask3_fmsub_pd'. Requires AVX512F.

func M256Mask3FmsubPs

func M256Mask3FmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask3_fmsub_ps'. Requires AVX512F.

func M256Mask3FmsubaddPd

func M256Mask3FmsubaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask3_fmsubadd_pd'. Requires AVX512F.

func M256Mask3FmsubaddPs

func M256Mask3FmsubaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask3_fmsubadd_ps'. Requires AVX512F.
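
The fmaddsub and fmsubadd pairs differ only in which lane parity adds and which subtracts. A hypothetical Go sketch of the unmasked fmaddsub pattern (fmsubadd simply swaps the two branches):

    // fmaddsubPd: even lanes compute a*b - c, odd lanes a*b + c,
    // matching the (j is even) test in the pseudocode above.
    func fmaddsubPd(a, b, c [4]float64) (dst [4]float64) {
    	for j := 0; j < 4; j++ {
    		if j%2 == 0 {
    			dst[j] = a[j]*b[j] - c[j]
    		} else {
    			dst[j] = a[j]*b[j] + c[j]
    		}
    	}
    	return
    }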

func M256Mask3FnmaddPd

func M256Mask3FnmaddPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask3_fnmadd_pd'. Requires AVX512F.

func M256Mask3FnmaddPs

func M256Mask3FnmaddPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask3_fnmadd_ps'. Requires AVX512F.

func M256Mask3FnmsubPd

func M256Mask3FnmsubPd(a x86.M256d, b x86.M256d, c x86.M256d, k x86.Mmask8) (dst x86.M256d)

M256Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask3_fnmsub_pd'. Requires AVX512F.

func M256Mask3FnmsubPs

func M256Mask3FnmsubPs(a x86.M256, b x86.M256, c x86.M256, k x86.Mmask8) (dst x86.M256)

M256Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask3_fnmsub_ps'. Requires AVX512F.

func M256MaskAbsEpi32

func M256MaskAbsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm256_mask_abs_epi32'. Requires AVX512F.

func M256MaskAbsEpi64

func M256MaskAbsEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_mask_abs_epi64'. Requires AVX512F.

func M256MaskAddEpi32

func M256MaskAddEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm256_mask_add_epi32'. Requires AVX512F.
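
All of the masked arithmetic intrinsics follow the same shape: compute the lane when its mask bit is set, otherwise copy the lane from 'src'. A hypothetical Go model of the 32-bit masked add:

    // maskAddEpi32: lanes with the mask bit set get a+b, the rest keep src.
    func maskAddEpi32(src [8]int32, k uint8, a, b [8]int32) (dst [8]int32) {
    	for j := 0; j < 8; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = a[j] + b[j]
    		} else {
    			dst[j] = src[j]
    		}
    	}
    	return
    }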

func M256MaskAddEpi64

func M256MaskAddEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm256_mask_add_epi64'. Requires AVX512F.

func M256MaskAndEpi32

func M256MaskAndEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm256_mask_and_epi32'. Requires AVX512F.

func M256MaskAndEpi64

func M256MaskAndEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm256_mask_and_epi64'. Requires AVX512F.

func M256MaskAndnotEpi32

func M256MaskAndnotEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm256_mask_andnot_epi32'. Requires AVX512F.

func M256MaskAndnotEpi64

func M256MaskAndnotEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm256_mask_andnot_epi64'. Requires AVX512F.

func M256MaskBlendEpi32

func M256MaskBlendEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMD'. Intrinsic: '_mm256_mask_blend_epi32'. Requires AVX512F.

func M256MaskBlendEpi64

func M256MaskBlendEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMQ'. Intrinsic: '_mm256_mask_blend_epi64'. Requires AVX512F.

func M256MaskBlendPd

func M256MaskBlendPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBLENDMPD'. Intrinsic: '_mm256_mask_blend_pd'. Requires AVX512F.

func M256MaskBlendPs

func M256MaskBlendPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBLENDMPS'. Intrinsic: '_mm256_mask_blend_ps'. Requires AVX512F.
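
Blend is masking with the other input as the fallback: where the masked operations above copy from 'src', blend copies from 'a'. A hypothetical Go model:

    // maskBlendPs picks b where the mask bit is set, a elsewhere.
    func maskBlendPs(k uint8, a, b [8]float32) (dst [8]float32) {
    	for j := 0; j < 8; j++ {
    		if k&(1<<uint(j)) != 0 {
    			dst[j] = b[j]
    		} else {
    			dst[j] = a[j]
    		}
    	}
    	return
    }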

func M256MaskBroadcastF32x4

func M256MaskBroadcastF32x4(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_mask_broadcast_f32x4'. Requires AVX512F.
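
The 128-bit source repeats across both halves of the destination, so lane j reads source element j mod 4 before the writemask is applied. A hypothetical Go sketch of the unmasked broadcast:

    // broadcastF32x4 repeats the 4-element source across all 8 lanes.
    func broadcastF32x4(a [4]float32) (dst [8]float32) {
    	for j := 0; j < 8; j++ {
    		dst[j] = a[j%4]
    	}
    	return
    }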

func M256MaskBroadcastI32x4

func M256MaskBroadcastI32x4(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_mask_broadcast_i32x4'. Requires AVX512F.

func M256MaskBroadcastdEpi32

func M256MaskBroadcastdEpi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_broadcastd_epi32'. Requires AVX512F.

func M256MaskBroadcastqEpi64

func M256MaskBroadcastqEpi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_broadcastq_epi64'. Requires AVX512F.

func M256MaskBroadcastsdPd

func M256MaskBroadcastsdPd(src x86.M256d, k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_mask_broadcastsd_pd'. Requires AVX512F.

func M256MaskBroadcastssPs

func M256MaskBroadcastssPs(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_mask_broadcastss_ps'. Requires AVX512F.

func M256MaskCmpEpi32Mask

func M256MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
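
The 'imm8' operand selects one of the eight _MM_CMPINT_* predicates, and the zeromask then gates each lane of the result. A hypothetical Go model of the predicate dispatch for a single signed 32-bit lane:

    // cmpInt32 applies the _MM_CMPINT_* predicate enumerated above
    // to one signed 32-bit lane.
    func cmpInt32(a, b int32, imm8 byte) bool {
    	switch imm8 & 7 {
    	case 0: // _MM_CMPINT_EQ
    		return a == b
    	case 1: // _MM_CMPINT_LT
    		return a < b
    	case 2: // _MM_CMPINT_LE
    		return a <= b
    	case 3: // _MM_CMPINT_FALSE
    		return false
    	case 4: // _MM_CMPINT_NEQ
    		return a != b
    	case 5: // _MM_CMPINT_NLT
    		return a >= b
    	case 6: // _MM_CMPINT_NLE
    		return a > b
    	default: // _MM_CMPINT_TRUE
    		return true
    	}
    }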

func M256MaskCmpEpi64Mask

func M256MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu32Mask

func M256MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu64Mask

func M256MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask8)

M256MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpPdMask

func M256MaskCmpPdMask(k1 x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.Mmask8)

M256MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm256_mask_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpPsMask

func M256MaskCmpPsMask(k1 x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.Mmask8)

M256MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm256_mask_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpeqEpi32Mask

func M256MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpeq_epi32_mask'. Requires AVX512F.

func M256MaskCmpeqEpi64Mask

func M256MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpeq_epi64_mask'. Requires AVX512F.

func M256MaskCmpeqEpu32Mask

func M256MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpeq_epu32_mask'. Requires AVX512F.

func M256MaskCmpeqEpu64Mask

func M256MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpeq_epu64_mask'. Requires AVX512F.

func M256MaskCmpgeEpi32Mask

func M256MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpge_epi32_mask'. Requires AVX512F.

func M256MaskCmpgeEpi64Mask

func M256MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpge_epi64_mask'. Requires AVX512F.

func M256MaskCmpgeEpu32Mask

func M256MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpge_epu32_mask'. Requires AVX512F.

func M256MaskCmpgeEpu64Mask

func M256MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpge_epu64_mask'. Requires AVX512F.

func M256MaskCmpgtEpi32Mask

func M256MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpgt_epi32_mask'. Requires AVX512F.

func M256MaskCmpgtEpi64Mask

func M256MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpgt_epi64_mask'. Requires AVX512F.

func M256MaskCmpgtEpu32Mask

func M256MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpgt_epu32_mask'. Requires AVX512F.

func M256MaskCmpgtEpu64Mask

func M256MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpgt_epu64_mask'. Requires AVX512F.

func M256MaskCmpleEpi32Mask

func M256MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmple_epi32_mask'. Requires AVX512F.

func M256MaskCmpleEpi64Mask

func M256MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmple_epi64_mask'. Requires AVX512F.

func M256MaskCmpleEpu32Mask

func M256MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmple_epu32_mask'. Requires AVX512F.

func M256MaskCmpleEpu64Mask

func M256MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmple_epu64_mask'. Requires AVX512F.

func M256MaskCmpltEpi32Mask

func M256MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmplt_epi32_mask'. Requires AVX512F.

func M256MaskCmpltEpi64Mask

func M256MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmplt_epi64_mask'. Requires AVX512F.

func M256MaskCmpltEpu32Mask

func M256MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmplt_epu32_mask'. Requires AVX512F.

func M256MaskCmpltEpu64Mask

func M256MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmplt_epu64_mask'. Requires AVX512F.

func M256MaskCmpneqEpi32Mask

func M256MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm256_mask_cmpneq_epi32_mask'. Requires AVX512F.

func M256MaskCmpneqEpi64Mask

func M256MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm256_mask_cmpneq_epi64_mask'. Requires AVX512F.

func M256MaskCmpneqEpu32Mask

func M256MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm256_mask_cmpneq_epu32_mask'. Requires AVX512F.

func M256MaskCmpneqEpu64Mask

func M256MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm256_mask_cmpneq_epu64_mask'. Requires AVX512F.

func M256MaskCompressEpi32

func M256MaskCompressEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_mask_compress_epi32'. Requires AVX512F.
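
The compress pattern is the trickiest of these masked operations to read from the pseudocode alone: active elements are packed contiguously toward lane 0, and only the tail keeps 'src'. Below is a minimal pure-Go model of the behaviour described above (names and the [8]int32 representation are illustrative, not the package API).

	// maskCompressEpi32 packs the k-selected lanes of a contiguously
	// from lane 0; lanes past the last packed element keep src.
	func maskCompressEpi32(src [8]int32, k uint8, a [8]int32) [8]int32 {
		dst := src // unwritten upper lanes pass through from src
		m := 0
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		return dst
	}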

func M256MaskCompressEpi64

func M256MaskCompressEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_mask_compress_epi64'. Requires AVX512F.

func M256MaskCompressPd

func M256MaskCompressPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_mask_compress_pd'. Requires AVX512F.

func M256MaskCompressPs

func M256MaskCompressPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_mask_compress_ps'. Requires AVX512F.

func M256MaskCvtRoundpsPh

func M256MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvt_roundps_ph'. Requires AVX512F.

func M256MaskCvtepi16Epi32

func M256MaskCvtepi16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_mask_cvtepi16_epi32'. Requires AVX512F.
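
In plain Go the per-element operation is just an int16-to-int32 conversion, which sign-extends. A small sketch of the merge behaviour, with illustrative names (not the package API):

	// maskCvtepi16Epi32 widens each selected 16-bit lane with sign
	// extension; inactive lanes are copied from src.
	func maskCvtepi16Epi32(src [8]int32, k uint8, a [8]int16) [8]int32 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // Go's widening conversion sign-extends
			}
		}
		return dst
	}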

func M256MaskCvtepi16Epi64

func M256MaskCvtepi16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_mask_cvtepi16_epi64'. Requires AVX512F.

func M256MaskCvtepi32Epi16

func M256MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_mask_cvtepi32_epi16'. Requires AVX512F.
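
The 'with truncation' wording maps directly onto Go's narrowing integer conversion, which keeps only the low bits. A hedged sketch (illustrative names, not the package API):

	// maskCvtepi32Epi16 keeps the low 16 bits of each selected
	// 32-bit lane; inactive lanes are copied from src.
	func maskCvtepi32Epi16(src [8]int16, k uint8, a [8]int32) [8]int16 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int16(a[j]) // narrowing conversion truncates to the low bits
			}
		}
		return dst
	}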

func M256MaskCvtepi32Epi64

func M256MaskCvtepi32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_mask_cvtepi32_epi64'. Requires AVX512F.

func M256MaskCvtepi32Epi8

func M256MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_mask_cvtepi32_epi8'. Requires AVX512F.

func M256MaskCvtepi32Pd

func M256MaskCvtepi32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_mask_cvtepi32_pd'. Requires AVX512F.

func M256MaskCvtepi32Ps

func M256MaskCvtepi32Ps(src x86.M256, k x86.Mmask8, a x86.M256i) (dst x86.M256)

M256MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_mask_cvtepi32_ps'. Requires AVX512F.

func M256MaskCvtepi64Epi16

func M256MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_mask_cvtepi64_epi16'. Requires AVX512F.

func M256MaskCvtepi64Epi32

func M256MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_mask_cvtepi64_epi32'. Requires AVX512F.

func M256MaskCvtepi64Epi8

func M256MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_mask_cvtepi64_epi8'. Requires AVX512F.

func M256MaskCvtepi8Epi32

func M256MaskCvtepi8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_mask_cvtepi8_epi32'. Requires AVX512F.

func M256MaskCvtepi8Epi64

func M256MaskCvtepi8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_mask_cvtepi8_epi64'. Requires AVX512F.

func M256MaskCvtepu16Epi32

func M256MaskCvtepu16Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_mask_cvtepu16_epi32'. Requires AVX512F.

func M256MaskCvtepu16Epi64

func M256MaskCvtepu16Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_mask_cvtepu16_epi64'. Requires AVX512F.

func M256MaskCvtepu32Epi64

func M256MaskCvtepu32Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_mask_cvtepu32_epi64'. Requires AVX512F.

func M256MaskCvtepu32Pd

func M256MaskCvtepu32Pd(src x86.M256d, k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_mask_cvtepu32_pd'. Requires AVX512F.

func M256MaskCvtepu8Epi32

func M256MaskCvtepu8Epi32(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_mask_cvtepu8_epi32'. Requires AVX512F.

func M256MaskCvtepu8Epi64

func M256MaskCvtepu8Epi64(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_mask_cvtepu8_epi64'. Requires AVX512F.

func M256MaskCvtpdEpi32

func M256MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_mask_cvtpd_epi32'. Requires AVX512F.

func M256MaskCvtpdEpu32

func M256MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_mask_cvtpd_epu32'. Requires AVX512F.

func M256MaskCvtpdPs

func M256MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M256d) (dst x86.M128)

M256MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_mask_cvtpd_ps'. Requires AVX512F.
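
For the active lanes this is an ordinary float64-to-float32 narrowing. Note that Go's conversion rounds to nearest-even, which matches the default MXCSR rounding mode but not the other modes. An illustrative sketch:

	// maskCvtpdPs narrows selected float64 lanes to float32 and
	// merges inactive lanes from src.
	func maskCvtpdPs(src [4]float32, k uint8, a [4]float64) [4]float32 {
		dst := src
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = float32(a[j]) // rounds to nearest-even
			}
		}
		return dst
	}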

func M256MaskCvtphPs

func M256MaskCvtphPs(src x86.M256, k x86.Mmask8, a x86.M128i) (dst x86.M256)

M256MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_mask_cvtph_ps'. Requires AVX512F.

func M256MaskCvtpsEpi32

func M256MaskCvtpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_mask_cvtps_epi32'. Requires AVX512F.

func M256MaskCvtpsEpu32

func M256MaskCvtpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_mask_cvtps_epu32'. Requires AVX512F.

func M256MaskCvtpsPh

func M256MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_mask_cvtps_ph'. Requires AVX512F.

func M256MaskCvtsepi32Epi16

func M256MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_mask_cvtsepi32_epi16'. Requires AVX512F.
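
Signed saturation clamps to the destination range instead of wrapping, which is the whole difference from the plain truncating VPMOVDW above. A sketch of the per-element helper, using the standard library's math constants:

	import "math"

	// saturateInt32ToInt16 clamps x to the int16 range, mirroring
	// Saturate_Int32_To_Int16 in the pseudocode above.
	func saturateInt32ToInt16(x int32) int16 {
		if x > math.MaxInt16 {
			return math.MaxInt16
		}
		if x < math.MinInt16 {
			return math.MinInt16
		}
		return int16(x)
	}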

func M256MaskCvtsepi32Epi8

func M256MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_mask_cvtsepi32_epi8'. Requires AVX512F.

func M256MaskCvtsepi64Epi16

func M256MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_mask_cvtsepi64_epi16'. Requires AVX512F.

func M256MaskCvtsepi64Epi32

func M256MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_mask_cvtsepi64_epi32'. Requires AVX512F.

func M256MaskCvtsepi64Epi8

func M256MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_mask_cvtsepi64_epi8'. Requires AVX512F.

func M256MaskCvttpdEpi32

func M256MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_mask_cvttpd_epi32'. Requires AVX512F.

func M256MaskCvttpdEpu32

func M256MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_mask_cvttpd_epu32'. Requires AVX512F.

func M256MaskCvttpsEpi32

func M256MaskCvttpsEpi32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_mask_cvttps_epi32'. Requires AVX512F.
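
Go's float-to-int conversion also truncates toward zero, so the active lanes can be modelled directly. Beware that the hardware returns the 'integer indefinite' value for out-of-range inputs, while Go leaves such conversions implementation-dependent; the sketch below ignores that case.

	// maskCvttpsEpi32 truncates selected float32 lanes toward zero;
	// out-of-range inputs are deliberately not handled here.
	func maskCvttpsEpi32(src [8]int32, k uint8, a [8]float32) [8]int32 {
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // float-to-int conversion truncates toward zero
			}
		}
		return dst
	}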

func M256MaskCvttpsEpu32

func M256MaskCvttpsEpu32(src x86.M256i, k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_mask_cvttps_epu32'. Requires AVX512F.

func M256MaskCvtusepi32Epi16

func M256MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_mask_cvtusepi32_epi16'. Requires AVX512F.

func M256MaskCvtusepi32Epi8

func M256MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_mask_cvtusepi32_epi8'. Requires AVX512F.

func M256MaskCvtusepi64Epi16

func M256MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_mask_cvtusepi64_epi16'. Requires AVX512F.

func M256MaskCvtusepi64Epi32

func M256MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_mask_cvtusepi64_epi32'. Requires AVX512F.

func M256MaskCvtusepi64Epi8

func M256MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_mask_cvtusepi64_epi8'. Requires AVX512F.

func M256MaskDivPd

func M256MaskDivPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm256_mask_div_pd'. Requires AVX512F.

func M256MaskDivPs

func M256MaskDivPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm256_mask_div_ps'. Requires AVX512F.

func M256MaskExpandEpi32

func M256MaskExpandEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_mask_expand_epi32'. Requires AVX512F.

func M256MaskExpandEpi64

func M256MaskExpandEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_mask_expand_epi64'. Requires AVX512F.

func M256MaskExpandPd

func M256MaskExpandPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_mask_expand_pd'. Requires AVX512F.

func M256MaskExpandPs

func M256MaskExpandPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_mask_expand_ps'. Requires AVX512F.
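
Expand is the inverse of compress: source elements are consumed contiguously from lane 0 and scattered into the active destination lanes. A pure-Go model of the pseudocode above (illustrative names):

	// maskExpandPs distributes contiguous elements of a into the
	// k-selected lanes; inactive lanes are copied from src.
	func maskExpandPs(src [8]float32, k uint8, a [8]float32) [8]float32 {
		dst := src
		m := 0
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return dst
	}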

func M256MaskExtractf32x4Ps

func M256MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)

M256MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_mask_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskExtracti32x4Epi32

func M256MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_mask_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFixupimmPd

func M256MaskFixupimmPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 0.5
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFixupimmPs

func M256MaskFixupimmPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 0.5
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskFmaddPd

func M256MaskFmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_mask_fmadd_pd'. Requires AVX512F.

func M256MaskFmaddPs

func M256MaskFmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_mask_fmadd_ps'. Requires AVX512F.

func M256MaskFmaddsubPd

func M256MaskFmaddsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_mask_fmaddsub_pd'. Requires AVX512F.

func M256MaskFmaddsubPs

func M256MaskFmaddsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_mask_fmaddsub_ps'. Requires AVX512F.
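
The even/odd alternation is the only subtlety here. Also note that a plain Go a*b+c rounds twice, whereas the fused hardware operation rounds once (the standard library's math.FMA offers a fused float64 form); the sketch below keeps the simple, non-fused form for clarity.

	// maskFmaddsubPs: even lanes compute a*b-c, odd lanes a*b+c;
	// inactive lanes keep a. Unlike the instruction, a*b+c and
	// a*b-c here each round twice.
	func maskFmaddsubPs(a [8]float32, k uint8, b, c [8]float32) [8]float32 {
		dst := a
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			if j%2 == 0 {
				dst[j] = a[j]*b[j] - c[j]
			} else {
				dst[j] = a[j]*b[j] + c[j]
			}
		}
		return dst
	}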

func M256MaskFmsubPd

func M256MaskFmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_mask_fmsub_pd'. Requires AVX512F.

func M256MaskFmsubPs

func M256MaskFmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_mask_fmsub_ps'. Requires AVX512F.

func M256MaskFmsubaddPd

func M256MaskFmsubaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_mask_fmsubadd_pd'. Requires AVX512F.

func M256MaskFmsubaddPs

func M256MaskFmsubaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_mask_fmsubadd_ps'. Requires AVX512F.

func M256MaskFnmaddPd

func M256MaskFnmaddPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_mask_fnmadd_pd'. Requires AVX512F.

func M256MaskFnmaddPs

func M256MaskFnmaddPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_mask_fnmadd_ps'. Requires AVX512F.

func M256MaskFnmsubPd

func M256MaskFnmsubPd(a x86.M256d, k x86.Mmask8, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_mask_fnmsub_pd'. Requires AVX512F.

func M256MaskFnmsubPs

func M256MaskFnmsubPs(a x86.M256, k x86.Mmask8, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_mask_fnmsub_ps'. Requires AVX512F.

func M256MaskGetexpPd

func M256MaskGetexpPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_mask_getexp_pd'. Requires AVX512F.
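
For positive, normal inputs the per-element operation is just the unbiased exponent, which the standard library exposes through math.Frexp. A scalar sketch under that assumption (zero, denormals, NaN and infinity are not modelled):

	import "math"

	// getexpFP64 returns floor(log2(x)) as a float64 for positive,
	// normal x, mirroring ConvertExpFP64 above.
	func getexpFP64(x float64) float64 {
		_, exp := math.Frexp(x) // x = frac * 2^exp with frac in [0.5, 1)
		return float64(exp - 1)
	}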

func M256MaskGetexpPs

func M256MaskGetexpPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_mask_getexp_ps'. Requires AVX512F.

func M256MaskGetmantPd

func M256MaskGetmantPd(src x86.M256d, k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can

take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_mask_getmant_pd'. Requires AVX512F.

func M256MaskGetmantPs

func M256MaskGetmantPs(src x86.M256, k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can

take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_mask_getmant_ps'. Requires AVX512F.

func M256MaskInsertf32x4

func M256MaskInsertf32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_mask_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
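
'imm8' first selects which 128-bit half of the temporary receives 'b'; only then is the writemask applied. A pure-Go sketch of the two steps, with illustrative names and 'imm8' treated as a runtime value rather than an immediate:

	// maskInsertF32x4 copies a, overwrites the imm8-selected 128-bit
	// half (4 lanes) with b, then merges with src under mask k.
	func maskInsertF32x4(src [8]float32, k uint8, a [8]float32, b [4]float32, imm8 byte) [8]float32 {
		tmp := a
		off := 0
		if imm8&1 != 0 {
			off = 4
		}
		copy(tmp[off:off+4], b[:])
		dst := src
		for j := uint(0); j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return dst
	}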

func M256MaskInserti32x4

func M256MaskInserti32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_mask_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskMaxEpi32

func M256MaskMaxEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm256_mask_max_epi32'. Requires AVX512F.
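
A scalar Go sketch of this masked-max pattern (not the intrinsic; the helper
name is illustrative):

	// maskMaxEpi32 mirrors the VPMAXSD writemask behavior on 8 lanes.
	func maskMaxEpi32(src, a, b [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = src[j] // mask bit clear: keep the src element
			case a[j] > b[j]:
				dst[j] = a[j]
			default:
				dst[j] = b[j]
			}
		}
		return
	}

The Min/Epu variants below differ only in the comparison ('<' instead of '>',
unsigned instead of signed) and in lane width.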

func M256MaskMaxEpi64

func M256MaskMaxEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_mask_max_epi64'. Requires AVX512F.

func M256MaskMaxEpu32

func M256MaskMaxEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm256_mask_max_epu32'. Requires AVX512F.

func M256MaskMaxEpu64

func M256MaskMaxEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_mask_max_epu64'. Requires AVX512F.

func M256MaskMaxPd

func M256MaskMaxPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm256_mask_max_pd'. Requires AVX512F.

func M256MaskMaxPs

func M256MaskMaxPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm256_mask_max_ps'. Requires AVX512F.

func M256MaskMinEpi32

func M256MaskMinEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm256_mask_min_epi32'. Requires AVX512F.

func M256MaskMinEpi64

func M256MaskMinEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_mask_min_epi64'. Requires AVX512F.

func M256MaskMinEpu32

func M256MaskMinEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm256_mask_min_epu32'. Requires AVX512F.

func M256MaskMinEpu64

func M256MaskMinEpu64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_mask_min_epu64'. Requires AVX512F.

func M256MaskMinPd

func M256MaskMinPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm256_mask_min_pd'. Requires AVX512F.

func M256MaskMinPs

func M256MaskMinPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm256_mask_min_ps'. Requires AVX512F.

func M256MaskMovEpi32

func M256MaskMovEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_mask_mov_epi32'. Requires AVX512F.
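
Every masked intrinsic in this section reduces to this blend step applied to
some unmasked result; the mov form is that blend by itself. A minimal Go
sketch (hypothetical helper, not part of this package):

	// maskMov selects a[j] where the mask bit is set and src[j] elsewhere.
	func maskMov(src, a [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}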

func M256MaskMovEpi64

func M256MaskMovEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_mask_mov_epi64'. Requires AVX512F.

func M256MaskMovPd

func M256MaskMovPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm256_mask_mov_pd'. Requires AVX512F.

func M256MaskMovPs

func M256MaskMovPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm256_mask_mov_ps'. Requires AVX512F.

func M256MaskMovedupPd

func M256MaskMovedupPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_mask_movedup_pd'. Requires AVX512F.

func M256MaskMovehdupPs

func M256MaskMovehdupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_mask_movehdup_ps'. Requires AVX512F.

func M256MaskMoveldupPs

func M256MaskMoveldupPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_mask_moveldup_ps'. Requires AVX512F.

func M256MaskMulEpi32

func M256MaskMulEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm256_mask_mul_epi32'. Requires AVX512F.
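
The multiply is widening: only the low 32 bits of each 64-bit lane
participate, sign-extended to a full 64-bit product. A plain-Go sketch
(illustrative name):

	// maskMulEpi32 multiplies the low halves of each 64-bit lane as signed
	// 32-bit values, producing a signed 64-bit result per lane.
	func maskMulEpi32(src, a, b [4]int64, k uint8) (dst [4]int64) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j])) // sign-extend low halves
			} else {
				dst[j] = src[j]
			}
		}
		return
	}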

func M256MaskMulEpu32

func M256MaskMulEpu32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_mask_mul_epu32'. Requires AVX512F.

func M256MaskMulPd

func M256MaskMulPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm256_mask_mul_pd'. Requires AVX512F.

func M256MaskMulPs

func M256MaskMulPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm256_mask_mul_ps'. Requires AVX512F.

func M256MaskMulloEpi32

func M256MaskMulloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm256_mask_mullo_epi32'. Requires AVX512F.
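
Unlike the widening VPMULDQ form above, mullo keeps only the low 32 bits of
each product, which is ordinary wrapping multiplication. Sketch (assumed
helper name):

	// maskMulloEpi32 stores the low 32 bits of each 64-bit product.
	func maskMulloEpi32(src, a, b [8]int32, k uint8) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] * b[j] // Go's int32 multiply already truncates
			} else {
				dst[j] = src[j]
			}
		}
		return
	}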

func M256MaskOrEpi32

func M256MaskOrEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORD'. Intrinsic: '_mm256_mask_or_epi32'. Requires AVX512F.

func M256MaskOrEpi64

func M256MaskOrEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm256_mask_or_epi64'. Requires AVX512F.

func M256MaskPermutePd

func M256MaskPermutePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutePs

func M256MaskPermutePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutevarPd

func M256MaskPermutevarPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)

M256MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_mask_permutevar_pd'. Requires AVX512F.

func M256MaskPermutevarPs

func M256MaskPermutevarPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)

M256MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_mask_permutevar_ps'. Requires AVX512F.

func M256MaskPermutex2varEpi32

func M256MaskPermutex2varEpi32(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm256_mask_permutex2var_epi32'. Requires AVX512F.
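
Each index element carries a 3-bit lane selector plus a fourth bit that picks
between the two sources. A plain-Go sketch of that addressing (hypothetical
helper):

	// maskPermutex2varEpi32: idx bits [2:0] choose a lane, bit 3 chooses
	// between 'a' and 'b'; masked-off lanes keep their value from 'a'.
	func maskPermutex2varEpi32(a [8]int32, k uint8, idx, b [8]int32) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = a[j]
				continue
			}
			off := idx[j] & 7
			if idx[j]&8 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}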

func M256MaskPermutex2varEpi64

func M256MaskPermutex2varEpi64(a x86.M256i, k x86.Mmask8, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm256_mask_permutex2var_epi64'. Requires AVX512F.

func M256MaskPermutex2varPd

func M256MaskPermutex2varPd(a x86.M256d, k x86.Mmask8, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm256_mask_permutex2var_pd'. Requires AVX512F.

func M256MaskPermutex2varPs

func M256MaskPermutex2varPs(a x86.M256, k x86.Mmask8, idx x86.M256i, b x86.M256) (dst x86.M256)

M256MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm256_mask_permutex2var_ps'. Requires AVX512F.

func M256MaskPermutexEpi64

func M256MaskPermutexEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutexPd

func M256MaskPermutexPd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskPermutexvarEpi32

func M256MaskPermutexvarEpi32(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_mask_permutexvar_epi32'. Requires AVX512F.
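
The single-source form is simpler: each destination lane reads the lane of
'a' named by the low 3 bits of the corresponding index. Sketch (illustrative
name):

	// maskPermutexvarEpi32 gathers lanes of 'a' by index, across lanes.
	func maskPermutexvarEpi32(src [8]int32, k uint8, idx, a [8]int32) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[idx[j]&7]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}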

func M256MaskPermutexvarEpi64

func M256MaskPermutexvarEpi64(src x86.M256i, k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_mask_permutexvar_epi64'. Requires AVX512F.

func M256MaskPermutexvarPd

func M256MaskPermutexvarPd(src x86.M256d, k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_mask_permutexvar_pd'. Requires AVX512F.

func M256MaskPermutexvarPs

func M256MaskPermutexvarPs(src x86.M256, k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)

M256MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_mask_permutexvar_ps'. Requires AVX512F.

func M256MaskRcp14Pd

func M256MaskRcp14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_mask_rcp14_pd'. Requires AVX512F.

func M256MaskRcp14Ps

func M256MaskRcp14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_mask_rcp14_ps'. Requires AVX512F.

func M256MaskRolEpi32

func M256MaskRolEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
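
The rotate itself matches Go's math/bits. A scalar sketch, assuming
'import "math/bits"' (Go 1.9+); the helper name is illustrative:

	// maskRolEpi32 rotates each selected lane left by imm8 mod 32.
	func maskRolEpi32(src, a [8]uint32, k uint8, imm8 byte) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(imm8)%32)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

The ror/rolv/rorv variants below differ only in direction and in taking
per-lane counts from 'b' instead of an immediate.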

func M256MaskRolEpi64

func M256MaskRolEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRolvEpi32

func M256MaskRolvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_mask_rolv_epi32'. Requires AVX512F.

func M256MaskRolvEpi64

func M256MaskRolvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_mask_rolv_epi64'. Requires AVX512F.

func M256MaskRorEpi32

func M256MaskRorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRorEpi64

func M256MaskRorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRorvEpi32

func M256MaskRorvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_mask_rorv_epi32'. Requires AVX512F.

func M256MaskRorvEpi64

func M256MaskRorvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_mask_rorv_epi64'. Requires AVX512F.

func M256MaskRoundscalePd

func M256MaskRoundscalePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
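
Ignoring the precision-exception bookkeeping, the scale-round-unscale core is
small. A per-element Go sketch for the round-to-nearest-even direction
(assumes 'import "math"'; math.RoundToEven needs Go 1.10+):

	// roundscale rounds x to m fraction bits: 2^-m * round(2^m * x).
	func roundscale(x float64, m uint8) float64 {
		scaled := math.Ldexp(x, int(m)) // 2^m * x
		return math.Ldexp(math.RoundToEven(scaled), -int(m))
	}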

func M256MaskRoundscalePs

func M256MaskRoundscalePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskRsqrt14Pd

func M256MaskRsqrt14Pd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_mask_rsqrt14_pd'. Requires AVX512F.

func M256MaskRsqrt14Ps

func M256MaskRsqrt14Ps(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_mask_rsqrt14_ps'. Requires AVX512F.

func M256MaskScalefPd

func M256MaskScalefPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_mask_scalef_pd'. Requires AVX512F.
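
For finite, in-range inputs the SCALE() helper above is just a * 2^floor(b).
A one-line Go sketch that omits the NaN/denormal branches (assumes
'import "math"'):

	// scalef computes a * 2^floor(b) for ordinary finite inputs.
	func scalef(a, b float64) float64 {
		return math.Ldexp(a, int(math.Floor(b)))
	}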

func M256MaskScalefPs

func M256MaskScalefPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_mask_scalef_ps'. Requires AVX512F.

func M256MaskSet1Epi32

func M256MaskSet1Epi32(src x86.M256i, k x86.Mmask8, a int) (dst x86.M256i)

M256MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_mask_set1_epi32'. Requires AVX512F.

func M256MaskSet1Epi64

func M256MaskSet1Epi64(src x86.M256i, k x86.Mmask8, a int64) (dst x86.M256i)

M256MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_mask_set1_epi64'. Requires AVX512F.

func M256MaskShuffleEpi32

func M256MaskShuffleEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm256_mask_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
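
Each destination dword takes a 2-bit selector from 'imm8' and reads within
its own 128-bit lane. A plain-Go sketch of the SELECT4 pattern (hypothetical
helper):

	// maskShuffleEpi32 applies imm8's four 2-bit selectors per 128-bit lane.
	func maskShuffleEpi32(src [8]int32, k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = src[j]
				continue
			}
			lane := (j / 4) * 4                 // 0 for the low lane, 4 for the high
			sel := int(imm8>>(uint(j%4)*2)) & 3 // 2-bit selector for this dword
			dst[j] = a[lane+sel]
		}
		return
	}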

func M256MaskShuffleF32x4

func M256MaskShuffleF32x4(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_mask_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleF64x2

func M256MaskShuffleF64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_mask_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleI32x4

func M256MaskShuffleI32x4(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_mask_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleI64x2

func M256MaskShuffleI64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_mask_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShufflePd

func M256MaskShufflePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm256_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskShufflePs

func M256MaskShufflePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm256_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSllEpi32

func M256MaskSllEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_sll_epi32'. Requires AVX512F.

func M256MaskSllEpi64

func M256MaskSllEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_sll_epi64'. Requires AVX512F.

func M256MaskSlliEpi32

func M256MaskSlliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_mask_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSlliEpi64

func M256MaskSlliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSllvEpi32

func M256MaskSllvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm256_mask_sllv_epi32'. Requires AVX512F.
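
Counts of 32 or more produce zero, which Go's shift semantics on uint32
already guarantee, so a sketch needs no explicit clamp (illustrative helper
name):

	// maskSllvEpi32 shifts each selected lane left by its own count.
	func maskSllvEpi32(src, a, count [8]uint32, k uint8) (dst [8]uint32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] << count[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}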

func M256MaskSllvEpi64

func M256MaskSllvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_mask_sllv_epi64'. Requires AVX512F.

func M256MaskSqrtPd

func M256MaskSqrtPd(src x86.M256d, k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm256_mask_sqrt_pd'. Requires AVX512F.

func M256MaskSqrtPs

func M256MaskSqrtPs(src x86.M256, k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm256_mask_sqrt_ps'. Requires AVX512F.

func M256MaskSraEpi32

func M256MaskSraEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_sra_epi32'. Requires AVX512F.

func M256MaskSraEpi64

func M256MaskSraEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_sra_epi64'. Requires AVX512F.

func M256MaskSraiEpi32

func M256MaskSraiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_mask_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
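
An arithmetic shift by 31 already replicates the sign into every bit, so the
'imm8 > 31' case can be folded into a clamp. Scalar sketch (hypothetical
helper):

	// maskSraiEpi32 arithmetic-shifts each selected lane right by imm8,
	// saturating the count at 31 so oversized shifts leave only the sign.
	func maskSraiEpi32(src [8]int32, k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
		n := uint(imm8)
		if n > 31 {
			n = 31
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] >> n
			} else {
				dst[j] = src[j]
			}
		}
		return
	}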

func M256MaskSraiEpi64

func M256MaskSraiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSravEpi32

func M256MaskSravEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm256_mask_srav_epi32'. Requires AVX512F.

func M256MaskSravEpi64

func M256MaskSravEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_mask_srav_epi64'. Requires AVX512F.

func M256MaskSrlEpi32

func M256MaskSrlEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srl_epi32'. Requires AVX512F.

func M256MaskSrlEpi64

func M256MaskSrlEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srl_epi64'. Requires AVX512F.

func M256MaskSrliEpi32

func M256MaskSrliEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_mask_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSrliEpi64

func M256MaskSrliEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskSrlvEpi32

func M256MaskSrlvEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm256_mask_srlv_epi32'. Requires AVX512F.

func M256MaskSrlvEpi64

func M256MaskSrlvEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_mask_srlv_epi64'. Requires AVX512F.
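
The Srav/Srlv variants take a per-lane shift count from the matching element of 'count' rather than a single scalar. A scalar Go sketch contrasting sign fill (srav) with zero fill (srlv); the pseudocode above leaves counts over 31 implicit, and the saturation here follows the documented behavior of VPSRAVD/VPSRLVD. Helper names are illustrative.

package main

import "fmt"

// sravLane fills vacated bits with the sign bit; srlvLane fills with zeros.
// Counts above 31 saturate (all sign bits, or zero), per VPSRAVD/VPSRLVD.
func sravLane(a int32, n uint32) int32 {
	if n > 31 {
		return a >> 31
	}
	return a >> n
}

func srlvLane(a int32, n uint32) int32 {
	if n > 31 {
		return 0
	}
	return int32(uint32(a) >> n)
}

func main() {
	fmt.Println(sravLane(-8, 1), srlvLane(-8, 1)) // -4 vs 2147483644
}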

func M256MaskSubEpi32

func M256MaskSubEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm256_mask_sub_epi32'. Requires AVX512F.

func M256MaskSubEpi64

func M256MaskSubEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm256_mask_sub_epi64'. Requires AVX512F.

func M256MaskSubPd

func M256MaskSubPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm256_mask_sub_pd'. Requires AVX512F.

func M256MaskSubPs

func M256MaskSubPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm256_mask_sub_ps'. Requires AVX512F.

func M256MaskTernarylogicEpi32

func M256MaskTernarylogicEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskTernarylogicEpi64

func M256MaskTernarylogicEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
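
The 'imm8' acts as an 8-entry truth table indexed by the bit triple (src, a, b). A scalar Go sketch of one 32-bit lane; ternlog32 is an illustrative name, not part of this package.

package main

import "fmt"

// ternlog32 models one VPTERNLOGD lane: for each bit position h, the triple
// (src, a, b) forms a 3-bit index that selects one truth-table bit of imm8.
func ternlog32(src, a, b uint32, imm8 uint8) uint32 {
	var dst uint32
	for h := 0; h < 32; h++ {
		idx := ((src>>uint(h))&1)<<2 | ((a>>uint(h))&1)<<1 | (b>>uint(h))&1
		dst |= ((uint32(imm8) >> idx) & 1) << uint(h)
	}
	return dst
}

func main() {
	// 0x96 is the XOR3 table (bit idx set exactly when idx has odd parity),
	// so the result is src ^ a ^ b.
	fmt.Printf("%#x\n", ternlog32(0xF0F0F0F0, 0xCCCCCCCC, 0xAAAAAAAA, 0x96)) // 0x96969696
}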

func M256MaskTestEpi32Mask

func M256MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm256_mask_test_epi32_mask'. Requires AVX512F.

func M256MaskTestEpi64Mask

func M256MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_mask_test_epi64_mask'. Requires AVX512F.

func M256MaskTestnEpi32Mask

func M256MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 7
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_mask_testn_epi32_mask'. Requires AVX512F.

func M256MaskTestnEpi64Mask

func M256MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 3
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_mask_testn_epi64_mask'. Requires AVX512F.
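
Test and Testn differ only in the sense of the zero test, and both are gated by 'k1'. A scalar Go sketch with illustrative names:

package main

import "fmt"

// maskTestEpi32 models _mm256_mask_test_epi32_mask: bit j of the result is
// set when k1[j] is set and a[j]&b[j] is non-zero. testn flips the test,
// giving the VPTESTNMD behavior.
func maskTestEpi32(k1 uint8, a, b [8]uint32, testn bool) (k uint8) {
	for j := 0; j < 8; j++ {
		if (k1>>uint(j))&1 == 0 {
			continue // gated off by k1; result bit stays zero
		}
		zero := a[j]&b[j] == 0
		if zero == testn {
			k |= 1 << uint(j)
		}
	}
	return
}

func main() {
	a := [8]uint32{1, 2, 4, 8, 0, 0, 3, 5}
	b := [8]uint32{1, 1, 4, 4, 9, 0, 1, 2}
	fmt.Printf("%08b\n", maskTestEpi32(0xFF, a, b, false)) // 01000101
}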

func M256MaskUnpackhiEpi32

func M256MaskUnpackhiEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_mask_unpackhi_epi32'. Requires AVX512F.

func M256MaskUnpackhiEpi64

func M256MaskUnpackhiEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_mask_unpackhi_epi64'. Requires AVX512F.

func M256MaskUnpackhiPd

func M256MaskUnpackhiPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_mask_unpackhi_pd'. Requires AVX512F.

func M256MaskUnpackhiPs

func M256MaskUnpackhiPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_mask_unpackhi_ps'. Requires AVX512F.

func M256MaskUnpackloEpi32

func M256MaskUnpackloEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_mask_unpacklo_epi32'. Requires AVX512F.

func M256MaskUnpackloEpi64

func M256MaskUnpackloEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_mask_unpacklo_epi64'. Requires AVX512F.

func M256MaskUnpackloPd

func M256MaskUnpackloPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_mask_unpacklo_pd'. Requires AVX512F.

func M256MaskUnpackloPs

func M256MaskUnpackloPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_mask_unpacklo_ps'. Requires AVX512F.
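
All eight unpack variants above follow the same pattern of two independent 128-bit lanes; only the element width and the half selected differ. A scalar Go sketch of the 32-bit low-half interleave (illustrative name):

package main

import "fmt"

// unpackloEpi32 models VPUNPCKLDQ on a 256-bit vector viewed as two
// independent 128-bit lanes of four uint32s each: within each lane, the
// low two elements of a and b are interleaved.
func unpackloEpi32(a, b [8]uint32) (dst [8]uint32) {
	for lane := 0; lane < 2; lane++ {
		o := lane * 4
		dst[o+0] = a[o+0]
		dst[o+1] = b[o+0]
		dst[o+2] = a[o+1]
		dst[o+3] = b[o+1]
	}
	return
}

func main() {
	a := [8]uint32{0, 1, 2, 3, 4, 5, 6, 7}
	b := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(unpackloEpi32(a, b)) // [0 10 1 11 4 14 5 15]
}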

func M256MaskXorEpi32

func M256MaskXorEpi32(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm256_mask_xor_epi32'. Requires AVX512F.

func M256MaskXorEpi64

func M256MaskXorEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm256_mask_xor_epi64'. Requires AVX512F.

func M256MaskzAbsEpi32

func M256MaskzAbsEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm256_maskz_abs_epi32'. Requires AVX512F.

func M256MaskzAbsEpi64

func M256MaskzAbsEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm256_maskz_abs_epi64'. Requires AVX512F.
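
The Maskz variants that follow differ from the Mask variants above only in the inactive-lane policy: lanes are zeroed instead of copied from 'src'. A minimal scalar sketch of the zeromask form (illustrative name):

package main

import "fmt"

// maskzAbsEpi32 models _mm256_maskz_abs_epi32: active lanes get |a|,
// inactive lanes are zeroed rather than copied from a src operand.
func maskzAbsEpi32(k uint8, a [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			v := a[j]
			if v < 0 {
				v = -v // note: math.MinInt32 wraps to itself, as in hardware
			}
			dst[j] = v
		} // else: lane already zero
	}
	return
}

func main() {
	fmt.Println(maskzAbsEpi32(0x0F, [8]int32{-1, 2, -3, 4, -5, 6, -7, 8}))
	// [1 2 3 4 0 0 0 0]
}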

func M256MaskzAddEpi32

func M256MaskzAddEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm256_maskz_add_epi32'. Requires AVX512F.

func M256MaskzAddEpi64

func M256MaskzAddEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm256_maskz_add_epi64'. Requires AVX512F.

func M256MaskzAndEpi32

func M256MaskzAndEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm256_maskz_and_epi32'. Requires AVX512F.

func M256MaskzAndEpi64

func M256MaskzAndEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm256_maskz_and_epi64'. Requires AVX512F.

func M256MaskzAndnotEpi32

func M256MaskzAndnotEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm256_maskz_andnot_epi32'. Requires AVX512F.

func M256MaskzAndnotEpi64

func M256MaskzAndnotEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm256_maskz_andnot_epi64'. Requires AVX512F.

func M256MaskzBroadcastF32x4

func M256MaskzBroadcastF32x4(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm256_maskz_broadcast_f32x4'. Requires AVX512F.

func M256MaskzBroadcastI32x4

func M256MaskzBroadcastI32x4(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm256_maskz_broadcast_i32x4'. Requires AVX512F.
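
Both x4 broadcasts repeat the 128-bit source across the destination, so lane j reads source lane j mod 4, exactly as the 'n := (j mod 4)*32' indexing above spells out. A scalar sketch (illustrative name):

package main

import "fmt"

// maskzBroadcastI32x4 models VBROADCASTI32X4 with a zeromask: the 4-element
// source repeats across the 8 destination lanes.
func maskzBroadcastI32x4(k uint8, a [4]uint32) (dst [8]uint32) {
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[j] = a[j%4]
		}
	}
	return
}

func main() {
	fmt.Println(maskzBroadcastI32x4(0xFF, [4]uint32{1, 2, 3, 4}))
	// [1 2 3 4 1 2 3 4]
}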

func M256MaskzBroadcastdEpi32

func M256MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_broadcastd_epi32'. Requires AVX512F.

func M256MaskzBroadcastqEpi64

func M256MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_broadcastq_epi64'. Requires AVX512F.

func M256MaskzBroadcastsdPd

func M256MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_maskz_broadcastsd_pd'. Requires AVX512F.

func M256MaskzBroadcastssPs

func M256MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_maskz_broadcastss_ps'. Requires AVX512F.

func M256MaskzCompressEpi32

func M256MaskzCompressEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm256_maskz_compress_epi32'. Requires AVX512F.

func M256MaskzCompressEpi64

func M256MaskzCompressEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm256_maskz_compress_epi64'. Requires AVX512F.

func M256MaskzCompressPd

func M256MaskzCompressPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm256_maskz_compress_pd'. Requires AVX512F.

func M256MaskzCompressPs

func M256MaskzCompressPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm256_maskz_compress_ps'. Requires AVX512F.
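
All four Compress variants pack the active lanes contiguously at the bottom of 'dst' and zero the tail. A scalar Go sketch of the 32-bit integer form (illustrative name):

package main

import "fmt"

// maskzCompressEpi32 models VPCOMPRESSD: active lanes are written
// contiguously from index 0 upward; the remaining lanes stay zero.
func maskzCompressEpi32(k uint8, a [8]uint32) (dst [8]uint32) {
	m := 0
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[m] = a[j]
			m++
		}
	}
	return // dst[m:] is already zero
}

func main() {
	a := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(maskzCompressEpi32(0b10110010, a)) // [11 14 15 17 0 0 0 0]
}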

func M256MaskzCvtRoundpsPh

func M256MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvt_roundps_ph'. Requires AVX512F.

func M256MaskzCvtepi16Epi32

func M256MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_maskz_cvtepi16_epi32'. Requires AVX512F.

func M256MaskzCvtepi16Epi64

func M256MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_maskz_cvtepi16_epi64'. Requires AVX512F.

func M256MaskzCvtepi32Epi16

func M256MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm256_maskz_cvtepi32_epi16'. Requires AVX512F.

func M256MaskzCvtepi32Epi64

func M256MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_maskz_cvtepi32_epi64'. Requires AVX512F.

func M256MaskzCvtepi32Epi8

func M256MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm256_maskz_cvtepi32_epi8'. Requires AVX512F.

func M256MaskzCvtepi32Pd

func M256MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm256_maskz_cvtepi32_pd'. Requires AVX512F.

func M256MaskzCvtepi32Ps

func M256MaskzCvtepi32Ps(k x86.Mmask8, a x86.M256i) (dst x86.M256)

M256MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm256_maskz_cvtepi32_ps'. Requires AVX512F.

func M256MaskzCvtepi64Epi16

func M256MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm256_maskz_cvtepi64_epi16'. Requires AVX512F.

func M256MaskzCvtepi64Epi32

func M256MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm256_maskz_cvtepi64_epi32'. Requires AVX512F.

func M256MaskzCvtepi64Epi8

func M256MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm256_maskz_cvtepi64_epi8'. Requires AVX512F.

func M256MaskzCvtepi8Epi32

func M256MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_maskz_cvtepi8_epi32'. Requires AVX512F.

func M256MaskzCvtepi8Epi64

func M256MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_maskz_cvtepi8_epi64'. Requires AVX512F.

func M256MaskzCvtepu16Epi32

func M256MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_maskz_cvtepu16_epi32'. Requires AVX512F.

func M256MaskzCvtepu16Epi64

func M256MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_maskz_cvtepu16_epi64'. Requires AVX512F.

func M256MaskzCvtepu32Epi64

func M256MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_maskz_cvtepu32_epi64'. Requires AVX512F.

func M256MaskzCvtepu32Pd

func M256MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M256d)

M256MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm256_maskz_cvtepu32_pd'. Requires AVX512F.

func M256MaskzCvtepu8Epi32

func M256MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_maskz_cvtepu8_epi32'. Requires AVX512F.

func M256MaskzCvtepu8Epi64

func M256MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_maskz_cvtepu8_epi64'. Requires AVX512F.

func M256MaskzCvtpdEpi32

func M256MaskzCvtpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm256_maskz_cvtpd_epi32'. Requires AVX512F.

func M256MaskzCvtpdEpu32

func M256MaskzCvtpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm256_maskz_cvtpd_epu32'. Requires AVX512F.

func M256MaskzCvtpdPs

func M256MaskzCvtpdPs(k x86.Mmask8, a x86.M256d) (dst x86.M128)

M256MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm256_maskz_cvtpd_ps'. Requires AVX512F.

func M256MaskzCvtphPs

func M256MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M256)

M256MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm256_maskz_cvtph_ps'. Requires AVX512F.

func M256MaskzCvtpsEpi32

func M256MaskzCvtpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm256_maskz_cvtps_epi32'. Requires AVX512F.

func M256MaskzCvtpsEpu32

func M256MaskzCvtpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm256_maskz_cvtps_epu32'. Requires AVX512F.

func M256MaskzCvtpsPh

func M256MaskzCvtpsPh(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M128i)

M256MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm256_maskz_cvtps_ph'. Requires AVX512F.

func M256MaskzCvtsepi32Epi16

func M256MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm256_maskz_cvtsepi32_epi16'. Requires AVX512F.

func M256MaskzCvtsepi32Epi8

func M256MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm256_maskz_cvtsepi32_epi8'. Requires AVX512F.

func M256MaskzCvtsepi64Epi16

func M256MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm256_maskz_cvtsepi64_epi16'. Requires AVX512F.

func M256MaskzCvtsepi64Epi32

func M256MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm256_maskz_cvtsepi64_epi32'. Requires AVX512F.

func M256MaskzCvtsepi64Epi8

func M256MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm256_maskz_cvtsepi64_epi8'. Requires AVX512F.
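
Unlike the plain truncating converts earlier, the saturating converts clamp out-of-range values to the bounds of the narrower type instead of dropping high bits. A scalar sketch of the per-element operation (illustrative name):

package main

import "fmt"

// saturateInt64ToInt8 models the per-element operation of VPMOVSQB:
// values outside [-128, 127] clamp to the nearest bound rather than
// being truncated bit-wise.
func saturateInt64ToInt8(v int64) int8 {
	switch {
	case v > 127:
		return 127
	case v < -128:
		return -128
	}
	return int8(v)
}

func main() {
	for _, v := range []int64{-1000, -1, 300, 42} {
		fmt.Println(saturateInt64ToInt8(v)) // -128, -1, 127, 42
	}
}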

func M256MaskzCvttpdEpi32

func M256MaskzCvttpdEpi32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm256_maskz_cvttpd_epi32'. Requires AVX512F.

func M256MaskzCvttpdEpu32

func M256MaskzCvttpdEpu32(k x86.Mmask8, a x86.M256d) (dst x86.M128i)

M256MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm256_maskz_cvttpd_epu32'. Requires AVX512F.

func M256MaskzCvttpsEpi32

func M256MaskzCvttpsEpi32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm256_maskz_cvttps_epi32'. Requires AVX512F.

func M256MaskzCvttpsEpu32

func M256MaskzCvttpsEpu32(k x86.Mmask8, a x86.M256) (dst x86.M256i)

M256MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm256_maskz_cvttps_epu32'. Requires AVX512F.

func M256MaskzCvtusepi32Epi16

func M256MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm256_maskz_cvtusepi32_epi16'. Requires AVX512F.

func M256MaskzCvtusepi32Epi8

func M256MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm256_maskz_cvtusepi32_epi8'. Requires AVX512F.

func M256MaskzCvtusepi64Epi16

func M256MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm256_maskz_cvtusepi64_epi16'. Requires AVX512F.

func M256MaskzCvtusepi64Epi32

func M256MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm256_maskz_cvtusepi64_epi32'. Requires AVX512F.

func M256MaskzCvtusepi64Epi8

func M256MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm256_maskz_cvtusepi64_epi8'. Requires AVX512F.

func M256MaskzDivPd

func M256MaskzDivPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm256_maskz_div_pd'. Requires AVX512F.

func M256MaskzDivPs

func M256MaskzDivPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm256_maskz_div_ps'. Requires AVX512F.

func M256MaskzExpandEpi32

func M256MaskzExpandEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm256_maskz_expand_epi32'. Requires AVX512F.

func M256MaskzExpandEpi64

func M256MaskzExpandEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm256_maskz_expand_epi64'. Requires AVX512F.

func M256MaskzExpandPd

func M256MaskzExpandPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm256_maskz_expand_pd'. Requires AVX512F.

func M256MaskzExpandPs

func M256MaskzExpandPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm256_maskz_expand_ps'. Requires AVX512F.
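
Expand is the inverse of the Compress family earlier in this package: consecutive source elements are scattered into the active lanes, and inactive lanes are zeroed. A scalar sketch (illustrative name):

package main

import "fmt"

// maskzExpandEpi32 models VPEXPANDD: source elements are consumed in order
// and placed into the lanes whose mask bit is set.
func maskzExpandEpi32(k uint8, a [8]uint32) (dst [8]uint32) {
	m := 0
	for j := 0; j < 8; j++ {
		if (k>>uint(j))&1 == 1 {
			dst[j] = a[m]
			m++
		}
	}
	return
}

func main() {
	a := [8]uint32{10, 11, 12, 13, 14, 15, 16, 17}
	fmt.Println(maskzExpandEpi32(0b10110010, a)) // [0 10 0 0 11 12 0 13]
}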

func M256MaskzExtractf32x4Ps

func M256MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M128)

M256MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm256_maskz_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzExtracti32x4Epi32

func M256MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm256_maskz_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
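
Under the same array conventions as the expand sketch above, the masked extract reduces to a half-select followed by a per-lane mask:

// extracti32x4Epi32 selects the low (imm8 bit 0 clear) or high (bit 0 set)
// 128-bit half of 'a', then zeroes the lanes whose mask bit is clear.
func extracti32x4Epi32(k uint8, a [8]int32, imm8 byte) (dst [4]int32) {
	var tmp [4]int32
	if imm8&1 == 0 {
		copy(tmp[:], a[0:4]) // a[127:0]
	} else {
		copy(tmp[:], a[4:8]) // a[255:128]
	}
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = tmp[j]
		}
	}
	return dst
}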

func M256MaskzFixupimmPd

func M256MaskzFixupimmPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256i, imm8 byte) (dst x86.M256d)

M256MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm256_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzFixupimmPs

func M256MaskzFixupimmPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256i, imm8 byte) (dst x86.M256)

M256MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm256_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzFmaddPd

func M256MaskzFmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm256_maskz_fmadd_pd'. Requires AVX512F.
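
The fused multiply-add rounds once, not twice, and Go's standard library exposes the same contract as math.FMA. A minimal model with [4]float64 standing in for x86.M256d:

import "math"

// maskzFmaddPd (hypothetical name) computes (a*b)+c per active lane with
// a single rounding; masked-off lanes stay zero.
func maskzFmaddPd(k uint8, a, b, c [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = math.FMA(a[j], b[j], c[j])
		}
	}
	return dst
}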

func M256MaskzFmaddPs

func M256MaskzFmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm256_maskz_fmadd_ps'. Requires AVX512F.

func M256MaskzFmaddsubPd

func M256MaskzFmaddsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm256_maskz_fmaddsub_pd'. Requires AVX512F.
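
Note the lane parity: even-indexed lanes subtract 'c' and odd-indexed lanes add it. Made concrete under the same conventions:

import "math"

// maskzFmaddsubPd (hypothetical name): even lanes compute (a*b)-c, odd
// lanes (a*b)+c; negating c preserves the single-rounding FMA behavior.
func maskzFmaddsubPd(k uint8, a, b, c [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) == 0 {
			continue // zeromask: lane stays 0
		}
		if j%2 == 0 {
			dst[j] = math.FMA(a[j], b[j], -c[j])
		} else {
			dst[j] = math.FMA(a[j], b[j], c[j])
		}
	}
	return dst
}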

func M256MaskzFmaddsubPs

func M256MaskzFmaddsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm256_maskz_fmaddsub_ps'. Requires AVX512F.

func M256MaskzFmsubPd

func M256MaskzFmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm256_maskz_fmsub_pd'. Requires AVX512F.

func M256MaskzFmsubPs

func M256MaskzFmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm256_maskz_fmsub_ps'. Requires AVX512F.

func M256MaskzFmsubaddPd

func M256MaskzFmsubaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm256_maskz_fmsubadd_pd'. Requires AVX512F.

func M256MaskzFmsubaddPs

func M256MaskzFmsubaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm256_maskz_fmsubadd_ps'. Requires AVX512F.

func M256MaskzFnmaddPd

func M256MaskzFnmaddPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm256_maskz_fnmadd_pd'. Requires AVX512F.

func M256MaskzFnmaddPs

func M256MaskzFnmaddPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm256_maskz_fnmadd_ps'. Requires AVX512F.

func M256MaskzFnmsubPd

func M256MaskzFnmsubPd(k x86.Mmask8, a x86.M256d, b x86.M256d, c x86.M256d) (dst x86.M256d)

M256MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm256_maskz_fnmsub_pd'. Requires AVX512F.

func M256MaskzFnmsubPs

func M256MaskzFnmsubPs(k x86.Mmask8, a x86.M256, b x86.M256, c x86.M256) (dst x86.M256)

M256MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm256_maskz_fnmsub_ps'. Requires AVX512F.

func M256MaskzGetexpPd

func M256MaskzGetexpPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm256_maskz_getexp_pd'. Requires AVX512F.

func M256MaskzGetexpPs

func M256MaskzGetexpPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm256_maskz_getexp_ps'. Requires AVX512F.
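
For finite non-zero inputs, math.Logb returns exactly this quantity: the unbiased binary exponent, floor(log2(|x|)). A scalar-loop model (zeros, infinities and NaN follow IEEE conventions that may differ in detail from ConvertExpFP32):

import "math"

// maskzGetexpPs (hypothetical name) models VGETEXPPS on finite inputs.
func maskzGetexpPs(k uint8, a [8]float32) (dst [8]float32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = float32(math.Logb(float64(a[j])))
		}
	}
	return dst
}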

func M256MaskzGetmantPd

func M256MaskzGetmantPd(k x86.Mmask8, a x86.M256d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256d)

M256MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can
take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:
    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm256_maskz_getmant_pd'. Requires AVX512F.

func M256MaskzGetmantPs

func M256MaskzGetmantPs(k x86.Mmask8, a x86.M256, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M256)

M256MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can
take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:
    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm256_maskz_getmant_ps'. Requires AVX512F.

func M256MaskzInsertf32x4

func M256MaskzInsertf32x4(k x86.Mmask8, a x86.M256, b x86.M128, imm8 byte) (dst x86.M256)

M256MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm256_maskz_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzInserti32x4

func M256MaskzInserti32x4(k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm256_maskz_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzMaxEpi32

func M256MaskzMaxEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm256_maskz_max_epi32'. Requires AVX512F.
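
The comparison logic, transliterated with the zeromask applied per lane:

// maskzMaxEpi32 (hypothetical name) keeps the signed per-lane maximum
// where the mask bit is set and zero elsewhere.
func maskzMaxEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		}
	}
	return dst
}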

func M256MaskzMaxEpi64

func M256MaskzMaxEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_maskz_max_epi64'. Requires AVX512F.

func M256MaskzMaxEpu32

func M256MaskzMaxEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm256_maskz_max_epu32'. Requires AVX512F.

func M256MaskzMaxEpu64

func M256MaskzMaxEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_maskz_max_epu64'. Requires AVX512F.

func M256MaskzMaxPd

func M256MaskzMaxPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm256_maskz_max_pd'. Requires AVX512F.

func M256MaskzMaxPs

func M256MaskzMaxPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm256_maskz_max_ps'. Requires AVX512F.

func M256MaskzMinEpi32

func M256MaskzMinEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm256_maskz_min_epi32'. Requires AVX512F.

func M256MaskzMinEpi64

func M256MaskzMinEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_maskz_min_epi64'. Requires AVX512F.

func M256MaskzMinEpu32

func M256MaskzMinEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm256_maskz_min_epu32'. Requires AVX512F.

func M256MaskzMinEpu64

func M256MaskzMinEpu64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_maskz_min_epu64'. Requires AVX512F.

func M256MaskzMinPd

func M256MaskzMinPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm256_maskz_min_pd'. Requires AVX512F.

func M256MaskzMinPs

func M256MaskzMinPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm256_maskz_min_ps'. Requires AVX512F.

func M256MaskzMovEpi32

func M256MaskzMovEpi32(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm256_maskz_mov_epi32'. Requires AVX512F.

func M256MaskzMovEpi64

func M256MaskzMovEpi64(k x86.Mmask8, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm256_maskz_mov_epi64'. Requires AVX512F.

func M256MaskzMovPd

func M256MaskzMovPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm256_maskz_mov_pd'. Requires AVX512F.

func M256MaskzMovPs

func M256MaskzMovPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm256_maskz_mov_ps'. Requires AVX512F.

func M256MaskzMovedupPd

func M256MaskzMovedupPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm256_maskz_movedup_pd'. Requires AVX512F.
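
The duplication copies each even-indexed lane into the following odd lane, so the source index is just the lane index with its low bit cleared:

// maskzMovedupPd (hypothetical name): j&^1 yields source lanes 0, 0, 2, 2.
func maskzMovedupPd(k uint8, a [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[j&^1]
		}
	}
	return dst
}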

func M256MaskzMovehdupPs

func M256MaskzMovehdupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm256_maskz_movehdup_ps'. Requires AVX512F.

func M256MaskzMoveldupPs

func M256MaskzMoveldupPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm256_maskz_moveldup_ps'. Requires AVX512F.

func M256MaskzMulEpi32

func M256MaskzMulEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm256_maskz_mul_epi32'. Requires AVX512F.
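
Only the low 32 bits of each 64-bit lane participate, read as signed values, and the full 64-bit product is kept. In Go the truncation is a plain conversion:

// maskzMulEpi32 (hypothetical name) models VPMULDQ: int32(x) keeps the
// low 32 bits with their sign; widening before the multiply preserves
// the full 64-bit product.
func maskzMulEpi32(k uint8, a, b [4]int64) (dst [4]int64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
		}
	}
	return dst
}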

func M256MaskzMulEpu32

func M256MaskzMulEpu32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_maskz_mul_epu32'. Requires AVX512F.

func M256MaskzMulPd

func M256MaskzMulPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm256_maskz_mul_pd'. Requires AVX512F.

func M256MaskzMulPs

func M256MaskzMulPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm256_maskz_mul_ps'. Requires AVX512F.

func M256MaskzMulloEpi32

func M256MaskzMulloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm256_maskz_mullo_epi32'. Requires AVX512F.
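
Keeping the low 32 bits of the 64-bit intermediate is exactly what wrapping 32-bit multiplication yields, so no explicit widening is needed:

// maskzMulloEpi32 (hypothetical name): int32 multiplication in Go wraps
// modulo 2^32, matching the "store the low 32 bits" step above.
func maskzMulloEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[j] * b[j]
		}
	}
	return dst
}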

func M256MaskzOrEpi32

func M256MaskzOrEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORD'. Intrinsic: '_mm256_maskz_or_epi32'. Requires AVX512F.

func M256MaskzOrEpi64

func M256MaskzOrEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm256_maskz_or_epi64'. Requires AVX512F.

func M256MaskzPermutePd

func M256MaskzPermutePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutePs

func M256MaskzPermutePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutevarPd

func M256MaskzPermutevarPd(k x86.Mmask8, a x86.M256d, b x86.M256i) (dst x86.M256d)

M256MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm256_maskz_permutevar_pd'. Requires AVX512F.

func M256MaskzPermutevarPs

func M256MaskzPermutevarPs(k x86.Mmask8, a x86.M256, b x86.M256i) (dst x86.M256)

M256MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm256_maskz_permutevar_ps'. Requires AVX512F.

func M256MaskzPermutex2varEpi32

func M256MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_maskz_permutex2var_epi32'. Requires AVX512F.
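
Each index element encodes both a source and a lane: bit 3 selects between 'a' and 'b', and bits 2:0 select the lane within the chosen source:

// maskzPermutex2varEpi32 (hypothetical name): idx bit 3 picks the table,
// the low three bits pick the lane within it.
func maskzPermutex2varEpi32(k uint8, a, idx, b [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) == 0 {
			continue // zeromask: lane stays 0
		}
		off := idx[j] & 7
		if idx[j]&8 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}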

func M256MaskzPermutex2varEpi64

func M256MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_maskz_permutex2var_epi64'. Requires AVX512F.

func M256MaskzPermutex2varPd

func M256MaskzPermutex2varPd(k x86.Mmask8, a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_maskz_permutex2var_pd'. Requires AVX512F.

func M256MaskzPermutex2varPs

func M256MaskzPermutex2varPs(k x86.Mmask8, a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)

M256MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_maskz_permutex2var_ps'. Requires AVX512F.

func M256MaskzPermutexEpi64

func M256MaskzPermutexEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutexPd

func M256MaskzPermutexPd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzPermutexvarEpi32

func M256MaskzPermutexvarEpi32(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_maskz_permutexvar_epi32'. Requires AVX512F.
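
The single-source variant needs only the low three bits of each index:

// maskzPermutexvarEpi32 (hypothetical name) gathers a[idx[j]&7] into
// each active lane.
func maskzPermutexvarEpi32(k uint8, idx, a [8]int32) (dst [8]int32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = a[idx[j]&7]
		}
	}
	return dst
}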

func M256MaskzPermutexvarEpi64

func M256MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_maskz_permutexvar_epi64'. Requires AVX512F.

func M256MaskzPermutexvarPd

func M256MaskzPermutexvarPd(k x86.Mmask8, idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_maskz_permutexvar_pd'. Requires AVX512F.

func M256MaskzPermutexvarPs

func M256MaskzPermutexvarPs(k x86.Mmask8, idx x86.M256i, a x86.M256) (dst x86.M256)

M256MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_maskz_permutexvar_ps'. Requires AVX512F.

func M256MaskzRcp14Pd

func M256MaskzRcp14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_maskz_rcp14_pd'. Requires AVX512F.

func M256MaskzRcp14Ps

func M256MaskzRcp14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_maskz_rcp14_ps'. Requires AVX512F.
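
A hedged model that substitutes the exact reciprocal for the hardware approximation, which is only guaranteed to a relative error below 2^-14:

// maskzRcp14Ps (hypothetical name): exact 1/x stands in for APPROXIMATE.
func maskzRcp14Ps(k uint8, a [8]float32) (dst [8]float32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = 1 / a[j]
		}
	}
	return dst
}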

func M256MaskzRolEpi32

func M256MaskzRolEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRolEpi64

func M256MaskzRolEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRolvEpi32

func M256MaskzRolvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_maskz_rolv_epi32'. Requires AVX512F.
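
math/bits provides the rotate directly, including the modulo-32 reduction of the count performed by LEFT_ROTATE_DWORDS. A model with unsigned lanes (the rorv variants below are the same call with a negated count):

import "math/bits"

// maskzRolvEpi32 (hypothetical name) rotates each active lane left by
// the count held in the corresponding lane of 'b', reduced mod 32.
func maskzRolvEpi32(k uint8, a, b [8]uint32) (dst [8]uint32) {
	for j := 0; j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
		}
	}
	return dst
}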

func M256MaskzRolvEpi64

func M256MaskzRolvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_maskz_rolv_epi64'. Requires AVX512F.

func M256MaskzRorEpi32

func M256MaskzRorEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRorEpi64

func M256MaskzRorEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRorvEpi32

func M256MaskzRorvEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_maskz_rorv_epi32'. Requires AVX512F.

func M256MaskzRorvEpi64

func M256MaskzRorvEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_maskz_rorv_epi64'. Requires AVX512F.

func M256MaskzRoundscalePd

func M256MaskzRoundscalePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
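
The core of the operation is "scale up by 2^M, round to an integer, scale back down". A single-lane model of the round-to-nearest-even case (imm8[1:0] == 0) that omits the #PE reporting:

import "math"

// roundscaleLane (hypothetical helper) rounds x to m fraction bits:
// 2^-M * RoundToEven(2^M * x).
func roundscaleLane(x float64, m uint) float64 {
	scaled := math.Ldexp(x, int(m))     // 2^M * x
	rounded := math.RoundToEven(scaled) // nearest integer, ties to even
	return math.Ldexp(rounded, -int(m)) // scale back down
}

For example, roundscaleLane(2.71828, 4) rounds to the nearest multiple of 1/16 and returns 2.6875.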

func M256MaskzRoundscalePs

func M256MaskzRoundscalePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzRsqrt14Pd

func M256MaskzRsqrt14Pd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm256_maskz_rsqrt14_pd'. Requires AVX512F.

func M256MaskzRsqrt14Ps

func M256MaskzRsqrt14Ps(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm256_maskz_rsqrt14_ps'. Requires AVX512F.

func M256MaskzScalefPd

func M256MaskzScalefPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_maskz_scalef_pd'. Requires AVX512F.
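
For finite inputs in range, the scale step is a * 2^floor(b), which math.Ldexp expresses directly; the NaN and denormal handling of SCALE is deliberately omitted here:

import "math"

// maskzScalefPd (hypothetical name) models the finite-input path only:
// floor(b) must fit in an int, and NaN/Inf are not special-cased.
func maskzScalefPd(k uint8, a, b [4]float64) (dst [4]float64) {
	for j := 0; j < 4; j++ {
		if k&(1<<j) != 0 {
			dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
		}
	}
	return dst
}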

func M256MaskzScalefPs

func M256MaskzScalefPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_maskz_scalef_ps'. Requires AVX512F.
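
For readers following the SCALE pseudocode, here is a pure-Go sketch of the common case, omitting the NaN, infinity, and denormal handling. math.Ldexp multiplies by an integral power of two, matching src1 * 2^FLOOR(src2); the helper name is illustrative only.

import "math"

func maskzScalefPd(k uint8, a, b [4]float64) (dst [4]float64) {
	for j := range a {
		if k&(1<<uint(j)) != 0 {
			// a[j] * 2^floor(b[j]); special cases from the pseudocode omitted
			dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
		}
	}
	return
}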

func M256MaskzSet1Epi32

func M256MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M256i)

M256MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_maskz_set1_epi32'. Requires AVX512F.

func M256MaskzSet1Epi64

func M256MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M256i)

M256MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_maskz_set1_epi64'. Requires AVX512F.
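
The broadcast-with-zeromask pattern is simple enough to state directly in Go; this hypothetical helper mirrors the pseudocode above.

func maskzSet1Epi64(k uint8, a int64) (dst [4]int64) {
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a // broadcast a into selected elements
		} // unselected elements remain zero
	}
	return
}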

func M256MaskzShuffleEpi32

func M256MaskzShuffleEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm256_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
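
The SELECT4 machinery amounts to indexing within each 128-bit (four-element) lane using a 2-bit field of imm8, with the same selectors reused in both lanes. A pure-Go sketch (helper name illustrative):

func maskzShuffleEpi32(k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
	for j := range a {
		lane := (j / 4) * 4                 // base element of this 128-bit lane
		sel := int(imm8>>(uint(j%4)*2)) & 3 // 2-bit selector, reused in both lanes
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[lane+sel]
		}
	}
	return
}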

func M256MaskzShuffleF32x4

func M256MaskzShuffleF32x4(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_maskz_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleF64x2

func M256MaskzShuffleF64x2(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_maskz_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleI32x4

func M256MaskzShuffleI32x4(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_maskz_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleI64x2

func M256MaskzShuffleI64x2(k x86.Mmask8, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_maskz_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
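
All four 256-bit lane shuffles above follow the same SELECT2 pattern: one bit of imm8 picks a 128-bit half of 'a' for the low half of the result, and another picks a half of 'b' for the high half. A pure-Go sketch of the 64-bit-integer case (names illustrative):

func maskzShuffleI64x2(k uint8, a, b [4]int64, imm8 byte) (dst [4]int64) {
	lo := int(imm8&1) * 2    // imm8[0]: which half of a feeds the low half
	hi := int(imm8>>1&1) * 2 // imm8[1]: which half of b feeds the high half
	tmp := [4]int64{a[lo], a[lo+1], b[hi], b[hi+1]}
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = tmp[j]
		}
	}
	return
}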

func M256MaskzShufflePd

func M256MaskzShufflePd(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm256_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzShufflePs

func M256MaskzShufflePs(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm256_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSllEpi32

func M256MaskzSllEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_sll_epi32'. Requires AVX512F.

func M256MaskzSllEpi64

func M256MaskzSllEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_sll_epi64'. Requires AVX512F.

func M256MaskzSlliEpi32

func M256MaskzSlliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSlliEpi64

func M256MaskzSlliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
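
Note the explicit out-of-range clause in these shift entries: counts above the element width zero the element rather than being truncated. Go's shift operator already yields zero for oversized unsigned shifts, but the sketch below keeps the check to mirror the pseudocode (helper name illustrative).

func maskzSlliEpi32(k uint8, a [8]int32, imm8 byte) (dst [8]int32) {
	for j := range a {
		if k&(1<<uint(j)) != 0 && imm8 <= 31 {
			dst[j] = int32(uint32(a[j]) << imm8) // logical shift, zeros shifted in
		} // masked-off or over-shifted elements are zero
	}
	return
}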

func M256MaskzSllvEpi32

func M256MaskzSllvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm256_maskz_sllv_epi32'. Requires AVX512F.

func M256MaskzSllvEpi64

func M256MaskzSllvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_maskz_sllv_epi64'. Requires AVX512F.

func M256MaskzSqrtPd

func M256MaskzSqrtPd(k x86.Mmask8, a x86.M256d) (dst x86.M256d)

M256MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm256_maskz_sqrt_pd'. Requires AVX512F.

func M256MaskzSqrtPs

func M256MaskzSqrtPs(k x86.Mmask8, a x86.M256) (dst x86.M256)

M256MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm256_maskz_sqrt_ps'. Requires AVX512F.

func M256MaskzSraEpi32

func M256MaskzSraEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_sra_epi32'. Requires AVX512F.

func M256MaskzSraEpi64

func M256MaskzSraEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_sra_epi64'. Requires AVX512F.

func M256MaskzSraiEpi32

func M256MaskzSraiEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSraiEpi64

func M256MaskzSraiEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
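
Arithmetic right shifts differ from the logical shifts above in what the out-of-range case produces: the element saturates to copies of its sign bit (all zeros or all ones) instead of zero. A pure-Go sketch, relying on Go's >> sign-extending signed operands (name illustrative):

func maskzSraiEpi64(k uint8, a [4]int64, imm8 byte) (dst [4]int64) {
	for j := range a {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: element stays zero
		}
		n := imm8
		if n > 63 {
			n = 63 // a shift by 63 leaves only copies of the sign bit
		}
		dst[j] = a[j] >> n // sign bits shifted in
	}
	return
}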

func M256MaskzSravEpi32

func M256MaskzSravEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm256_maskz_srav_epi32'. Requires AVX512F.

func M256MaskzSravEpi64

func M256MaskzSravEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_maskz_srav_epi64'. Requires AVX512F.

func M256MaskzSrlEpi32

func M256MaskzSrlEpi32(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srl_epi32'. Requires AVX512F.

func M256MaskzSrlEpi64

func M256MaskzSrlEpi64(k x86.Mmask8, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srl_epi64'. Requires AVX512F.

func M256MaskzSrliEpi32

func M256MaskzSrliEpi32(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSrliEpi64

func M256MaskzSrliEpi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzSrlvEpi32

func M256MaskzSrlvEpi32(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm256_maskz_srlv_epi32'. Requires AVX512F.

func M256MaskzSrlvEpi64

func M256MaskzSrlvEpi64(k x86.Mmask8, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_maskz_srlv_epi64'. Requires AVX512F.

func M256MaskzSubEpi32

func M256MaskzSubEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm256_maskz_sub_epi32'. Requires AVX512F.

func M256MaskzSubEpi64

func M256MaskzSubEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm256_maskz_sub_epi64'. Requires AVX512F.

func M256MaskzSubPd

func M256MaskzSubPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm256_maskz_sub_pd'. Requires AVX512F.

func M256MaskzSubPs

func M256MaskzSubPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm256_maskz_sub_ps'. Requires AVX512F.

func M256MaskzTernarylogicEpi32

func M256MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256MaskzTernarylogicEpi64

func M256MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
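
VPTERNLOG treats imm8 as an 8-entry truth table: the bits of 'a', 'b', and 'c' at each position form a 3-bit index, and the indexed bit of imm8 becomes the result bit. A pure-Go sketch of that lookup plus the zeromask (names illustrative):

func ternlog64(a, b, c uint64, imm8 byte) (dst uint64) {
	for h := uint(0); h < 64; h++ {
		idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1 // a supplies the high index bit
		dst |= uint64(imm8>>idx&1) << h
	}
	return
}

func maskzTernarylogicEpi64(k uint8, a, b, c [4]uint64, imm8 byte) (dst [4]uint64) {
	for j := range a {
		if k&(1<<uint(j)) != 0 {
			dst[j] = ternlog64(a[j], b[j], c[j], imm8)
		}
	}
	return
}

For example, imm8 = 0xE8 (bits 3, 5, 6, and 7 set) yields the bitwise majority function (a AND b) OR (a AND c) OR (b AND c).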

func M256MaskzUnpackhiEpi32

func M256MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi32'. Requires AVX512F.
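
The interleave helpers operate per 128-bit lane and never cross lanes. A pure-Go sketch of the high-dword case (name illustrative):

func maskzUnpackhiEpi32(k uint8, a, b [8]int32) (dst [8]int32) {
	var tmp [8]int32
	for lane := 0; lane < 8; lane += 4 {
		tmp[lane+0] = a[lane+2] // src1[95:64]
		tmp[lane+1] = b[lane+2] // src2[95:64]
		tmp[lane+2] = a[lane+3] // src1[127:96]
		tmp[lane+3] = b[lane+3] // src2[127:96]
	}
	for j := range dst {
		if k&(1<<uint(j)) != 0 {
			dst[j] = tmp[j]
		}
	}
	return
}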

func M256MaskzUnpackhiEpi64

func M256MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_maskz_unpackhi_epi64'. Requires AVX512F.

func M256MaskzUnpackhiPd

func M256MaskzUnpackhiPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm256_maskz_unpackhi_pd'. Requires AVX512F.

func M256MaskzUnpackhiPs

func M256MaskzUnpackhiPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm256_maskz_unpackhi_ps'. Requires AVX512F.

func M256MaskzUnpackloEpi32

func M256MaskzUnpackloEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi32'. Requires AVX512F.

func M256MaskzUnpackloEpi64

func M256MaskzUnpackloEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_maskz_unpacklo_epi64'. Requires AVX512F.

func M256MaskzUnpackloPd

func M256MaskzUnpackloPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm256_maskz_unpacklo_pd'. Requires AVX512F.

func M256MaskzUnpackloPs

func M256MaskzUnpackloPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm256_maskz_unpacklo_ps'. Requires AVX512F.

func M256MaskzXorEpi32

func M256MaskzXorEpi32(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm256_maskz_xor_epi32'. Requires AVX512F.

func M256MaskzXorEpi64

func M256MaskzXorEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm256_maskz_xor_epi64'. Requires AVX512F.

func M256MaxEpi64

func M256MaxEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm256_max_epi64'. Requires AVX512F.

func M256MaxEpu64

func M256MaxEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm256_max_epu64'. Requires AVX512F.

func M256MinEpi64

func M256MinEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm256_min_epi64'. Requires AVX512F.

func M256MinEpu64

func M256MinEpu64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm256_min_epu64'. Requires AVX512F.
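
The epi64/epu64 pairs above share one pseudocode skeleton; only the signedness of the comparison differs. In Go that distinction is carried entirely by the element type, as this sketch of the two maxima shows (names illustrative):

func maxEpi64(a, b [4]int64) (dst [4]int64) {
	for j := range a {
		if a[j] > b[j] { // signed compare (VPMAXSQ)
			dst[j] = a[j]
		} else {
			dst[j] = b[j]
		}
	}
	return
}

func maxEpu64(a, b [4]uint64) (dst [4]uint64) {
	for j := range a {
		if a[j] > b[j] { // unsigned compare (VPMAXUQ)
			dst[j] = a[j]
		} else {
			dst[j] = b[j]
		}
	}
	return
}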

func M256Permutex2varEpi32

func M256Permutex2varEpi32(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm256_permutex2var_epi32'. Requires AVX512F.

func M256Permutex2varEpi64

func M256Permutex2varEpi64(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm256_permutex2var_epi64'. Requires AVX512F.

func M256Permutex2varPd

func M256Permutex2varPd(a x86.M256d, idx x86.M256i, b x86.M256d) (dst x86.M256d)

M256Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	off := idx[i+1:i]*64
	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm256_permutex2var_pd'. Requires AVX512F.

func M256Permutex2varPs

func M256Permutex2varPs(a x86.M256, idx x86.M256i, b x86.M256) (dst x86.M256)

M256Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	off := idx[i+2:i]*32
	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm256_permutex2var_ps'. Requires AVX512F.
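
In the two-source permutes, each index element does double duty: its low bits select a source element, and the next bit selects between the two source vectors. A pure-Go sketch of the 32-bit case (name illustrative):

func permutex2varEpi32(a, idx, b [8]int32) (dst [8]int32) {
	for j := range dst {
		off := int(idx[j]) & 7 // idx[i+2:i]: element offset
		if idx[j]&8 != 0 {     // idx[i+3]: table select
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}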

func M256PermutexEpi64

func M256PermutexEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256PermutexEpi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256PermutexPd

func M256PermutexPd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256PermutexvarEpi32

func M256PermutexvarEpi32(idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_permutexvar_epi32'. Requires AVX512F.

func M256PermutexvarEpi64

func M256PermutexvarEpi64(idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_permutexvar_epi64'. Requires AVX512F.

func M256PermutexvarPd

func M256PermutexvarPd(idx x86.M256i, a x86.M256d) (dst x86.M256d)

M256PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	id := idx[i+1:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_permutexvar_pd'. Requires AVX512F.

func M256PermutexvarPs

func M256PermutexvarPs(idx x86.M256i, a x86.M256) (dst x86.M256)

M256PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_permutexvar_ps'. Requires AVX512F.

func M256Rcp14Pd

func M256Rcp14Pd(a x86.M256d) (dst x86.M256d)

M256Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm256_rcp14_pd'. Requires AVX512F.

func M256Rcp14Ps

func M256Rcp14Ps(a x86.M256) (dst x86.M256)

M256Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm256_rcp14_ps'. Requires AVX512F.

func M256RolEpi32

func M256RolEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm256_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RolEpi64

func M256RolEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm256_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RolvEpi32

func M256RolvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm256_rolv_epi32'. Requires AVX512F.

func M256RolvEpi64

func M256RolvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm256_rolv_epi64'. Requires AVX512F.

func M256RorEpi32

func M256RorEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm256_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RorEpi64

func M256RorEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm256_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RorvEpi32

func M256RorvEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm256_rorv_epi32'. Requires AVX512F.

func M256RorvEpi64

func M256RorvEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm256_rorv_epi64'. Requires AVX512F.
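
The rotate family maps directly onto the standard library's math/bits, which already reduces the count modulo the width and expresses a right rotation as a negative left rotation. A sketch of the variable-count pair (names illustrative):

import "math/bits"

func rolvEpi32(a, b [8]uint32) (dst [8]uint32) {
	for j := range a {
		dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
	}
	return
}

func rorvEpi32(a, b [8]uint32) (dst [8]uint32) {
	for j := range a {
		dst[j] = bits.RotateLeft32(a[j], -int(b[j]&31)) // negative count rotates right
	}
	return
}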

func M256RoundscalePd

func M256RoundscalePd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm256_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256RoundscalePs

func M256RoundscalePs(a x86.M256, imm8 byte) (dst x86.M256)

M256RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm256_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
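
Stripped of the rounding-mode selection and the #PE bookkeeping, RoundTo_Integer rounds to M = imm8[7:4] fraction bits: scale up by 2^M, round to an integer, scale back down. A pure-Go sketch for rounding_direction 0, with math.RoundToEven standing in for the hardware's round-to-nearest-even:

import "math"

func roundscale(src float64, imm8 byte) float64 {
	m := int(imm8 >> 4)                         // imm8[7:4]: fraction bits to keep
	tmp := math.RoundToEven(math.Ldexp(src, m)) // round 2^M * src to an integer
	return math.Ldexp(tmp, -m)                  // scale back down by 2^-M
}

For instance, roundscale(1.2345, 0x40) keeps four fraction bits and returns 1.25.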

func M256ScalefPd

func M256ScalefPd(a x86.M256d, b x86.M256d) (dst x86.M256d)

M256ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm256_scalef_pd'. Requires AVX512F.

func M256ScalefPs

func M256ScalefPs(a x86.M256, b x86.M256) (dst x86.M256)

M256ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm256_scalef_ps'. Requires AVX512F.

func M256ShuffleF32x4

func M256ShuffleF32x4(a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256ShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm256_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleF64x2

func M256ShuffleF64x2(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256ShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm256_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleI32x4

func M256ShuffleI32x4(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm256_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256ShuffleI64x2

func M256ShuffleI64x2(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT2(src, control){
	CASE(control[0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm256_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256SraEpi64

func M256SraEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_sra_epi64'. Requires AVX512F.

func M256SraiEpi64

func M256SraiEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm256_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256SravEpi64

func M256SravEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm256_srav_epi64'. Requires AVX512F.

func M256TernarylogicEpi32

func M256TernarylogicEpi32(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 7
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm256_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256TernarylogicEpi64

func M256TernarylogicEpi64(a x86.M256i, b x86.M256i, c x86.M256i, imm8 byte) (dst x86.M256i)

M256TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 3
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm256_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M256TestEpi32Mask

func M256TestEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm256_test_epi32_mask'. Requires AVX512F.

func M256TestEpi64Mask

func M256TestEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm256_test_epi64_mask'. Requires AVX512F.

func M256TestnEpi32Mask

func M256TestnEpi32Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 7
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm256_testn_epi32_mask'. Requires AVX512F.

func M256TestnEpi64Mask

func M256TestnEpi64Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask8)

M256TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 3
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm256_testn_epi64_mask'. Requires AVX512F.
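
The test/testn pairs differ only in the polarity of the per-element check. A pure-Go sketch producing the 8-bit mask (names illustrative):

func testEpi32Mask(a, b [8]int32) (k uint8) {
	for j := range a {
		if a[j]&b[j] != 0 { // VPTESTMD: set bit when the AND is non-zero
			k |= 1 << uint(j)
		}
	}
	return
}

func testnEpi32Mask(a, b [8]int32) (k uint8) {
	for j := range a {
		if a[j]&b[j] == 0 { // VPTESTNMD: set bit when the AND is zero
			k |= 1 << uint(j)
		}
	}
	return
}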

func M512AbsEpi32

func M512AbsEpi32(a x86.M512i) (dst x86.M512i)

M512AbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_abs_epi32'. Requires AVX512F.

func M512AbsEpi64

func M512AbsEpi64(a x86.M512i) (dst x86.M512i)

M512AbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_abs_epi64'. Requires AVX512F.
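
As a reference for the ABS pseudocode, a scalar Go model (lane layout illustrative):

	// absEpi64 models VPABSQ over eight 64-bit lanes. As in two's
	// complement hardware, the absolute value of math.MinInt64 wraps
	// back to math.MinInt64.
	func absEpi64(a [8]int64) (dst [8]int64) {
		for j, v := range a {
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
		return dst
	}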

func M512AcosPd

func M512AcosPd(a x86.M512d) (dst x86.M512d)

M512AcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acos_pd'. Requires AVX512F.
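
The element-wise behaviour matches the standard library's math.Acos applied per lane, as in this illustrative model:

	import "math"

	// acosPd models _mm512_acos_pd one float64 lane at a time.
	func acosPd(a [8]float64) (dst [8]float64) {
		for j, v := range a {
			dst[j] = math.Acos(v) // NaN for inputs outside [-1, 1]
		}
		return dst
	}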

func M512AcosPs

func M512AcosPs(a x86.M512) (dst x86.M512)

M512AcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acos_ps'. Requires AVX512F.

func M512AcoshPd

func M512AcoshPd(a x86.M512d) (dst x86.M512d)

M512AcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acosh_pd'. Requires AVX512F.

func M512AcoshPs

func M512AcoshPs(a x86.M512) (dst x86.M512)

M512AcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_acosh_ps'. Requires AVX512F.

func M512AddEpi64

func M512AddEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_add_epi64'. Requires AVX512F.

func M512AlignrEpi64

func M512AlignrEpi64(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512AlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst'.

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
dst[511:0] := temp[511:0]
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_alignr_epi64'. Requires AVX512F.
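
A plain-Go sketch of the VALIGNQ shift, assuming 'count' is in [0, 15] as the 1024-bit pseudocode implies (lane layout illustrative):

	// alignrEpi64 models VALIGNQ: a (high) and b (low) are concatenated
	// into 16 lanes, shifted right by count lanes with zero fill, and
	// the low 8 lanes are kept.
	func alignrEpi64(a, b [8]uint64, count int) (dst [8]uint64) {
		var tmp [16]uint64
		copy(tmp[:8], b[:])
		copy(tmp[8:], a[:])
		for j := 0; j < 8; j++ {
			if j+count < 16 {
				dst[j] = tmp[j+count]
			}
		}
		return dst
	}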

func M512AsinPd

func M512AsinPd(a x86.M512d) (dst x86.M512d)

M512AsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asin_pd'. Requires AVX512F.

func M512AsinPs

func M512AsinPs(a x86.M512) (dst x86.M512)

M512AsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asin_ps'. Requires AVX512F.

func M512AsinhPd

func M512AsinhPd(a x86.M512d) (dst x86.M512d)

M512AsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asinh_pd'. Requires AVX512F.

func M512AsinhPs

func M512AsinhPs(a x86.M512) (dst x86.M512)

M512AsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_asinh_ps'. Requires AVX512F.

func M512Atan2Pd

func M512Atan2Pd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512Atan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan2_pd'. Requires AVX512F.
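
The ATAN(a/b) pseudocode is a simplification; a quadrant-aware per-lane model in Go would use math.Atan2 (illustrative sketch):

	import "math"

	// atan2Pd models _mm512_atan2_pd lane by lane. math.Atan2 resolves
	// the quadrant from the signs of both arguments, which the plain
	// ATAN(a/b) pseudocode glosses over.
	func atan2Pd(a, b [8]float64) (dst [8]float64) {
		for j := range a {
			dst[j] = math.Atan2(a[j], b[j])
		}
		return dst
	}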

func M512Atan2Ps

func M512Atan2Ps(a x86.M512, b x86.M512) (dst x86.M512)

M512Atan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan2_ps'. Requires AVX512F.

func M512AtanPd

func M512AtanPd(a x86.M512d) (dst x86.M512d)

M512AtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' and store the results in 'dst' expressed in radians.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan_pd'. Requires AVX512F.

func M512AtanPs

func M512AtanPs(a x86.M512) (dst x86.M512)

M512AtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atan_ps'. Requires AVX512F.

func M512AtanhPd

func M512AtanhPd(a x86.M512d) (dst x86.M512d)

M512AtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atanh_pd'. Requires AVX512F.

func M512AtanhPs

func M512AtanhPs(a x86.M512) (dst x86.M512)

M512AtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_atanh_ps'. Requires AVX512F.

func M512BroadcastF32x4

func M512BroadcastF32x4(a x86.M128) (dst x86.M512)

M512BroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_broadcast_f32x4'. Requires AVX512F.

func M512BroadcastF64x4

func M512BroadcastF64x4(a x86.M256d) (dst x86.M512d)

M512BroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_broadcast_f64x4'. Requires AVX512F.

func M512BroadcastI32x4

func M512BroadcastI32x4(a x86.M128i) (dst x86.M512i)

M512BroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_broadcast_i32x4'. Requires AVX512F.

func M512BroadcastI64x4

func M512BroadcastI64x4(a x86.M256i) (dst x86.M512i)

M512BroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_broadcast_i64x4'. Requires AVX512F.
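
All four x4 broadcasts follow the same pattern; a Go model of this one (lane layout illustrative):

	// broadcastI64x4 models VBROADCASTI64X4: the four source lanes are
	// repeated into both 256-bit halves of the destination.
	func broadcastI64x4(a [4]int64) (dst [8]int64) {
		for j := range dst {
			dst[j] = a[j%4]
		}
		return dst
	}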

func M512BroadcastdEpi32

func M512BroadcastdEpi32(a x86.M128i) (dst x86.M512i)

M512BroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_broadcastd_epi32'. Requires AVX512F.

func M512BroadcastqEpi64

func M512BroadcastqEpi64(a x86.M128i) (dst x86.M512i)

M512BroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_broadcastq_epi64'. Requires AVX512F.

func M512BroadcastsdPd

func M512BroadcastsdPd(a x86.M128d) (dst x86.M512d)

M512BroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_broadcastsd_pd'. Requires AVX512F.

func M512BroadcastssPs

func M512BroadcastssPs(a x86.M128) (dst x86.M512)

M512BroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_broadcastss_ps'. Requires AVX512F.

func M512Castpd128Pd512

func M512Castpd128Pd512(a x86.M128d) (dst x86.M512d)

M512Castpd128Pd512: Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd128_pd512'. Requires AVX512F.

func M512Castpd256Pd512

func M512Castpd256Pd512(a x86.M256d) (dst x86.M512d)

M512Castpd256Pd512: Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd256_pd512'. Requires AVX512F.

func M512Castpd512Pd128

func M512Castpd512Pd128(a x86.M512d) (dst x86.M128d)

M512Castpd512Pd128: Cast vector of type __m512d to type __m128d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd512_pd128'. Requires AVX512F.

func M512Castpd512Pd256

func M512Castpd512Pd256(a x86.M512d) (dst x86.M256d)

M512Castpd512Pd256: Cast vector of type __m512d to type __m256d.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castpd512_pd256'. Requires AVX512F.

func M512Castps128Ps512

func M512Castps128Ps512(a x86.M128) (dst x86.M512)

M512Castps128Ps512: Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps128_ps512'. Requires AVX512F.

func M512Castps256Ps512

func M512Castps256Ps512(a x86.M256) (dst x86.M512)

M512Castps256Ps512: Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps256_ps512'. Requires AVX512F.

func M512Castps512Ps128

func M512Castps512Ps128(a x86.M512) (dst x86.M128)

M512Castps512Ps128: Cast vector of type __m512 to type __m128.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps512_ps128'. Requires AVX512F.

func M512Castps512Ps256

func M512Castps512Ps256(a x86.M512) (dst x86.M256)

M512Castps512Ps256: Cast vector of type __m512 to type __m256.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castps512_ps256'. Requires AVX512F.

func M512Castsi128Si512

func M512Castsi128Si512(a x86.M128i) (dst x86.M512i)

M512Castsi128Si512: Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi128_si512'. Requires AVX512F.

func M512Castsi256Si512

func M512Castsi256Si512(a x86.M256i) (dst x86.M512i)

M512Castsi256Si512: Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi256_si512'. Requires AVX512F.

func M512Castsi512Si128

func M512Castsi512Si128(a x86.M512i) (dst x86.M128i)

M512Castsi512Si128: Cast vector of type __m512i to type __m128i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_si128'. Requires AVX512F.

func M512Castsi512Si256

func M512Castsi512Si256(a x86.M512i) (dst x86.M256i)

M512Castsi512Si256: Cast vector of type __m512i to type __m256i.

This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.

Instruction: ''. Intrinsic: '_mm512_castsi512_si256'. Requires AVX512F.
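
All of these casts are reinterpretations, not conversions. A byte-level Go model of the down-cast, assuming a flat byte layout (an illustration, not the package's actual representation):

	// castSi512Si128 models _mm512_castsi512_si128: the low 128 bits
	// are kept as-is and nothing is converted.
	func castSi512Si128(a [64]byte) (dst [16]byte) {
		copy(dst[:], a[:16])
		return dst
	}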

func M512CbrtPd

func M512CbrtPd(a x86.M512d) (dst x86.M512d)

M512CbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cbrt_pd'. Requires AVX512F.

func M512CbrtPs

func M512CbrtPs(a x86.M512) (dst x86.M512)

M512CbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cbrt_ps'. Requires AVX512F.

func M512CdfnormPd

func M512CdfnormPd(a x86.M512d) (dst x86.M512d)

M512CdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorm_pd'. Requires AVX512F.

func M512CdfnormPs

func M512CdfnormPs(a x86.M512) (dst x86.M512)

M512CdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorm_ps'. Requires AVX512F.

func M512CdfnorminvPd

func M512CdfnorminvPd(a x86.M512d) (dst x86.M512d)

M512CdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_pd'. Requires AVX512F.

func M512CdfnorminvPs

func M512CdfnorminvPs(a x86.M512) (dst x86.M512)

M512CdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cdfnorminv_ps'. Requires AVX512F.

func M512CeilPd

func M512CeilPd(a x86.M512d) (dst x86.M512d)

M512CeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_ceil_pd'. Requires AVX512F.

func M512CeilPs

func M512CeilPs(a x86.M512) (dst x86.M512)

M512CeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_ceil_ps'. Requires AVX512F.

func M512CmpEpi64Mask

func M512CmpEpi64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512CmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
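
A plain-Go model of the eight _MM_CMPINT_* predicates (mask and lane layout illustrative):

	// cmpEpi64Mask models VPCMPQ: 'imm8' selects the predicate, and
	// bit j of the returned mask holds the result for lane j.
	func cmpEpi64Mask(a, b [8]int64, imm8 uint8) uint8 {
		var k uint8
		for j := range a {
			var r bool
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				r = a[j] == b[j]
			case 1: // _MM_CMPINT_LT
				r = a[j] < b[j]
			case 2: // _MM_CMPINT_LE
				r = a[j] <= b[j]
			case 3: // _MM_CMPINT_FALSE
				r = false
			case 4: // _MM_CMPINT_NEQ
				r = a[j] != b[j]
			case 5: // _MM_CMPINT_NLT
				r = a[j] >= b[j]
			case 6: // _MM_CMPINT_NLE
				r = a[j] > b[j]
			case 7: // _MM_CMPINT_TRUE
				r = true
			}
			if r {
				k |= 1 << uint(j)
			}
		}
		return k
	}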

func M512CmpEpu64Mask

func M512CmpEpu64Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512CmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512CmpeqEpi64Mask

func M512CmpeqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_cmpeq_epi64_mask'. Requires AVX512F.

func M512CmpeqEpu64Mask

func M512CmpeqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpeq_epu64_mask'. Requires AVX512F.

func M512CmpgeEpi64Mask

func M512CmpgeEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpge_epi64_mask'. Requires AVX512F.

func M512CmpgeEpu64Mask

func M512CmpgeEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpge_epu64_mask'. Requires AVX512F.

func M512CmpgtEpi64Mask

func M512CmpgtEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_cmpgt_epi64_mask'. Requires AVX512F.

func M512CmpgtEpu64Mask

func M512CmpgtEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpgt_epu64_mask'. Requires AVX512F.

func M512CmpleEpi64Mask

func M512CmpleEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmple_epi64_mask'. Requires AVX512F.

func M512CmpleEpu64Mask

func M512CmpleEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmple_epu64_mask'. Requires AVX512F.

func M512CmpltEpi32Mask

func M512CmpltEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512CmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*32
	k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_cmplt_epi32_mask'. Requires AVX512F.

func M512CmpltEpi64Mask

func M512CmpltEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmplt_epi64_mask'. Requires AVX512F.

func M512CmpltEpu64Mask

func M512CmpltEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmplt_epu64_mask'. Requires AVX512F.

func M512CmpneqEpi64Mask

func M512CmpneqEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_cmpneq_epi64_mask'. Requires AVX512F.

func M512CmpneqEpu64Mask

func M512CmpneqEpu64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512CmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*64
	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_cmpneq_epu64_mask'. Requires AVX512F.

func M512CosPd

func M512CosPd(a x86.M512d) (dst x86.M512d)

M512CosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cos_pd'. Requires AVX512F.

func M512CosPs

func M512CosPs(a x86.M512) (dst x86.M512)

M512CosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cos_ps'. Requires AVX512F.

func M512CosdPd

func M512CosdPd(a x86.M512d) (dst x86.M512d)

M512CosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosd_pd'. Requires AVX512F.

func M512CosdPs

func M512CosdPs(a x86.M512) (dst x86.M512)

M512CosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosd_ps'. Requires AVX512F.

func M512CoshPd

func M512CoshPd(a x86.M512d) (dst x86.M512d)

M512CoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosh_pd'. Requires AVX512F.

func M512CoshPs

func M512CoshPs(a x86.M512) (dst x86.M512)

M512CoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_cosh_ps'. Requires AVX512F.

func M512CvtRoundepi32Ps

func M512CvtRoundepi32Ps(a x86.M512i, rounding int) (dst x86.M512)

M512CvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvt_roundepi32_ps'. Requires AVX512F.

func M512CvtRoundepu32Ps

func M512CvtRoundepu32Ps(a x86.M512i, rounding int) (dst x86.M512)

M512CvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvt_roundepu32_ps'. Requires AVX512F.

func M512CvtRoundpdEpi32

func M512CvtRoundpdEpi32(a x86.M512d, rounding int) (dst x86.M256i)

M512CvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvt_roundpd_epi32'. Requires AVX512F.
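
A Go sketch of how the four basic rounding modes affect this conversion; the integer mode encoding here is illustrative and does not reproduce the _MM_FROUND_* bit values:

	import "math"

	// cvtRoundPdEpi32 models VCVTPD2DQ under the four basic rounding
	// modes. Out-of-range and NaN inputs are not modeled.
	func cvtRoundPdEpi32(a [8]float64, mode int) (dst [8]int32) {
		for j, v := range a {
			var r float64
			switch mode {
			case 0: // _MM_FROUND_TO_NEAREST_INT: round to nearest even
				r = math.RoundToEven(v)
			case 1: // _MM_FROUND_TO_NEG_INF: round down
				r = math.Floor(v)
			case 2: // _MM_FROUND_TO_POS_INF: round up
				r = math.Ceil(v)
			case 3: // _MM_FROUND_TO_ZERO: truncate
				r = math.Trunc(v)
			}
			dst[j] = int32(r)
		}
		return dst
	}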

func M512CvtRoundpdEpu32

func M512CvtRoundpdEpu32(a x86.M512d, rounding int) (dst x86.M256i)

M512CvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvt_roundpd_epu32'. Requires AVX512F.

func M512CvtRoundpdPs

func M512CvtRoundpdPs(a x86.M512d, rounding int) (dst x86.M256)

M512CvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			k := 64*j
			dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvt_roundpd_ps'. Requires AVX512F.

func M512CvtRoundphPs

func M512CvtRoundphPs(a x86.M256i, sae int) (dst x86.M512)

M512CvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvt_roundph_ps'. Requires AVX512F.

func M512CvtRoundpsEpi32

func M512CvtRoundpsEpi32(a x86.M512, rounding int) (dst x86.M512i)

M512CvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvt_roundps_epi32'. Requires AVX512F.

func M512CvtRoundpsEpu32

func M512CvtRoundpsEpu32(a x86.M512, rounding int) (dst x86.M512i)

M512CvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvt_roundps_epu32'. Requires AVX512F.

func M512CvtRoundpsPd

func M512CvtRoundpsPd(a x86.M256, sae int) (dst x86.M512d)

M512CvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		k := 32*j
		dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvt_roundps_pd'. Requires AVX512F.

func M512CvtRoundpsPh

func M512CvtRoundpsPh(a x86.M512, rounding int) (dst x86.M256i)

M512CvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvt_roundps_ph'. Requires AVX512F.

func M512Cvtepi16Epi32

func M512Cvtepi16Epi32(a x86.M256i) (dst x86.M512i)

M512Cvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_cvtepi16_epi32'. Requires AVX512F.
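
Sign extension maps directly onto Go's integer conversions; an illustrative model:

	// cvtEpi16Epi32 models VPMOVSXWD: Go's int16 to int32 conversion
	// performs exactly this sign extension.
	func cvtEpi16Epi32(a [16]int16) (dst [16]int32) {
		for j, v := range a {
			dst[j] = int32(v)
		}
		return dst
	}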

func M512Cvtepi16Epi64

func M512Cvtepi16Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_cvtepi16_epi64'. Requires AVX512F.

func M512Cvtepi32Epi16

func M512Cvtepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_cvtepi32_epi16'. Requires AVX512F.
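
Truncation likewise maps onto Go's narrowing conversions, which keep the low bits (illustrative model):

	// cvtEpi32Epi16 models VPMOVDW: each lane is narrowed to its low
	// 16 bits, with no saturation.
	func cvtEpi32Epi16(a [16]int32) (dst [16]int16) {
		for j, v := range a {
			dst[j] = int16(v)
		}
		return dst
	}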

func M512Cvtepi32Epi64

func M512Cvtepi32Epi64(a x86.M256i) (dst x86.M512i)

M512Cvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_cvtepi32_epi64'. Requires AVX512F.

func M512Cvtepi32Epi8

func M512Cvtepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_cvtepi32_epi8'. Requires AVX512F.

func M512Cvtepi32Pd

func M512Cvtepi32Pd(a x86.M256i) (dst x86.M512d)

M512Cvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	m := j*64
	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_cvtepi32_pd'. Requires AVX512F.

func M512Cvtepi32Ps

func M512Cvtepi32Ps(a x86.M512i) (dst x86.M512)

M512Cvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_cvtepi32_ps'. Requires AVX512F.

func M512Cvtepi64Epi16

func M512Cvtepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_cvtepi64_epi16'. Requires AVX512F.

func M512Cvtepi64Epi32

func M512Cvtepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_cvtepi64_epi32'. Requires AVX512F.

func M512Cvtepi64Epi8

func M512Cvtepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_cvtepi64_epi8'. Requires AVX512F.

func M512Cvtepi8Epi32

func M512Cvtepi8Epi32(a x86.M128i) (dst x86.M512i)

M512Cvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_cvtepi8_epi32'. Requires AVX512F.

func M512Cvtepi8Epi64

func M512Cvtepi8Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_cvtepi8_epi64'. Requires AVX512F.

func M512Cvtepu16Epi32

func M512Cvtepu16Epi32(a x86.M256i) (dst x86.M512i)

M512Cvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_cvtepu16_epi32'. Requires AVX512F.

func M512Cvtepu16Epi64

func M512Cvtepu16Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_cvtepu16_epi64'. Requires AVX512F.

func M512Cvtepu32Epi64

func M512Cvtepu32Epi64(a x86.M256i) (dst x86.M512i)

M512Cvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_cvtepu32_epi64'. Requires AVX512F.

func M512Cvtepu32Pd

func M512Cvtepu32Pd(a x86.M256i) (dst x86.M512d)

M512Cvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := ConvertUnsignedInt32_To_FP64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_cvtepu32_pd'. Requires AVX512F.

func M512Cvtepu32Ps

func M512Cvtepu32Ps(a x86.M512i) (dst x86.M512)

M512Cvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_cvtepu32_ps'. Requires AVX512F.

func M512Cvtepu8Epi32

func M512Cvtepu8Epi32(a x86.M128i) (dst x86.M512i)

M512Cvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_cvtepu8_epi32'. Requires AVX512F.

func M512Cvtepu8Epi64

func M512Cvtepu8Epi64(a x86.M128i) (dst x86.M512i)

M512Cvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_cvtepu8_epi64'. Requires AVX512F.

func M512CvtpdEpi32

func M512CvtpdEpi32(a x86.M512d) (dst x86.M256i)

M512CvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_cvtpd_epi32'. Requires AVX512F.

func M512CvtpdEpu32

func M512CvtpdEpu32(a x86.M512d) (dst x86.M256i)

M512CvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_cvtpd_epu32'. Requires AVX512F.

func M512CvtpdPs

func M512CvtpdPs(a x86.M512d) (dst x86.M256)

M512CvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_cvtpd_ps'. Requires AVX512F.

func M512CvtphPs

func M512CvtphPs(a x86.M256i) (dst x86.M512)

M512CvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	m := j*16
	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_cvtph_ps'. Requires AVX512F.

func M512CvtpsEpi32

func M512CvtpsEpi32(a x86.M512) (dst x86.M512i)

M512CvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_cvtps_epi32'. Requires AVX512F.

func M512CvtpsEpu32

func M512CvtpsEpu32(a x86.M512) (dst x86.M512i)

M512CvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_cvtps_epu32'. Requires AVX512F.

func M512CvtpsPd

func M512CvtpsPd(a x86.M256) (dst x86.M512d)

M512CvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_cvtps_pd'. Requires AVX512F.

func M512CvtpsPh

func M512CvtpsPh(a x86.M512, rounding int) (dst x86.M256i)

M512CvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_cvtps_ph'. Requires AVX512F.

func M512Cvtsepi32Epi16

func M512Cvtsepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_cvtsepi32_epi16'. Requires AVX512F.
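
Unlike the truncating VPMOVDW above, this down-conversion clamps first; a Go model using the standard math constants (lane layout illustrative):

	import "math"

	// cvtsEpi32Epi16 models VPMOVSDW: lanes are clamped to the int16
	// range before narrowing.
	func cvtsEpi32Epi16(a [16]int32) (dst [16]int16) {
		for j, v := range a {
			switch {
			case v > math.MaxInt16:
				v = math.MaxInt16
			case v < math.MinInt16:
				v = math.MinInt16
			}
			dst[j] = int16(v)
		}
		return dst
	}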

func M512Cvtsepi32Epi8

func M512Cvtsepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_cvtsepi32_epi8'. Requires AVX512F.

func M512Cvtsepi64Epi16

func M512Cvtsepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_cvtsepi64_epi16'. Requires AVX512F.

func M512Cvtsepi64Epi32

func M512Cvtsepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_cvtsepi64_epi32'. Requires AVX512F.

func M512Cvtsepi64Epi8

func M512Cvtsepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_cvtsepi64_epi8'. Requires AVX512F.

func M512CvttRoundpdEpi32

func M512CvttRoundpdEpi32(a x86.M512d, sae int) (dst x86.M256i)

M512CvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		k := 64*j
		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k])
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvtt_roundpd_epi32'. Requires AVX512F.

func M512CvttRoundpdEpu32

func M512CvttRoundpdEpu32(a x86.M512d, sae int) (dst x86.M256i)

M512CvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		k := 64*j
		dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k])
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvtt_roundpd_epu32'. Requires AVX512F.

func M512CvttRoundpsEpi32

func M512CvttRoundpsEpi32(a x86.M512, sae int) (dst x86.M512i)

M512CvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvtt_roundps_epi32'. Requires AVX512F.

func M512CvttRoundpsEpu32

func M512CvttRoundpsEpu32(a x86.M512, sae int) (dst x86.M512i)

M512CvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvtt_roundps_epu32'. Requires AVX512F.

func M512CvttpdEpi32

func M512CvttpdEpi32(a x86.M512d) (dst x86.M256i)

M512CvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_cvttpd_epi32'. Requires AVX512F.

func M512CvttpdEpu32

func M512CvttpdEpu32(a x86.M512d) (dst x86.M256i)

M512CvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_cvttpd_epu32'. Requires AVX512F.

func M512CvttpsEpi32

func M512CvttpsEpi32(a x86.M512) (dst x86.M512i)

M512CvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_cvttps_epi32'. Requires AVX512F.
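
The 'tt' variants always truncate toward zero; Go's float-to-int conversion does the same, as in this illustrative model (out-of-range inputs are not handled):

	// cvttPsEpi32 models VCVTTPS2DQ: conversion truncates toward zero
	// regardless of the current rounding mode.
	func cvttPsEpi32(a [16]float32) (dst [16]int32) {
		for j, v := range a {
			dst[j] = int32(v)
		}
		return dst
	}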

func M512CvttpsEpu32

func M512CvttpsEpu32(a x86.M512) (dst x86.M512i)

M512CvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_cvttps_epu32'. Requires AVX512F.

func M512Cvtusepi32Epi16

func M512Cvtusepi32Epi16(a x86.M512i) (dst x86.M256i)

M512Cvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_cvtusepi32_epi16'. Requires AVX512F.
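
A Go model of the unsigned clamp (lane layout illustrative):

	// cvtusEpi32Epi16 models VPMOVUSDW: lanes above 0xFFFF saturate to
	// 0xFFFF before narrowing.
	func cvtusEpi32Epi16(a [16]uint32) (dst [16]uint16) {
		for j, v := range a {
			if v > 0xFFFF {
				v = 0xFFFF
			}
			dst[j] = uint16(v)
		}
		return dst
	}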

func M512Cvtusepi32Epi8

func M512Cvtusepi32Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_cvtusepi32_epi8'. Requires AVX512F.

func M512Cvtusepi64Epi16

func M512Cvtusepi64Epi16(a x86.M512i) (dst x86.M128i)

M512Cvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 16*j
	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_cvtusepi64_epi16'. Requires AVX512F.

func M512Cvtusepi64Epi32

func M512Cvtusepi64Epi32(a x86.M512i) (dst x86.M256i)

M512Cvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 32*j
	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_cvtusepi64_epi32'. Requires AVX512F.

func M512Cvtusepi64Epi8

func M512Cvtusepi64Epi8(a x86.M512i) (dst x86.M128i)

M512Cvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	k := 8*j
	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_cvtusepi64_epi8'. Requires AVX512F.

func M512DivEpi16

func M512DivEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi16'. Requires AVX512F.

func M512DivEpi32

func M512DivEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi32'. Requires AVX512F.
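
There is no single instruction behind these SVML-style division helpers; element-wise Go division models the truncating semantics (lane layout illustrative):

	// divEpi32 models _mm512_div_epi32: per-lane truncating division.
	// A zero lane in b panics in Go rather than faulting.
	func divEpi32(a, b [16]int32) (dst [16]int32) {
		for j := range a {
			dst[j] = a[j] / b[j]
		}
		return dst
	}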

func M512DivEpi64

func M512DivEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi64'. Requires AVX512F.

func M512DivEpi8

func M512DivEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epi8'. Requires AVX512F.

func M512DivEpu16

func M512DivEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu16'. Requires AVX512F.

func M512DivEpu32

func M512DivEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu32'. Requires AVX512F.

func M512DivEpu64

func M512DivEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu64'. Requires AVX512F.

func M512DivEpu8

func M512DivEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512DivEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_div_epu8'. Requires AVX512F.
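
The eight _mm512_div_ep* forms above are emulated sequences rather than single instructions (hence Instruction: '...'). Per lane they reduce to Go's native integer division, which already truncates toward zero; a sketch:

// divTruncLane models one signed lane: Go's integer '/' truncates
// toward zero, matching TRUNCATE in the pseudocode. Division by
// zero panics, the scalar analogue of a hardware #DE fault.
func divTruncLane(a, b int32) int32 {
	return a / b
}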

func M512DivPd

func M512DivPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512DivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_pd'. Requires AVX512F.

func M512DivPs

func M512DivPs(a x86.M512, b x86.M512) (dst x86.M512)

M512DivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_ps'. Requires AVX512F.

func M512DivRoundPd

func M512DivRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512DivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			dst[i+63:i] := a[i+63:i] / b[i+63:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_div_round_pd'. Requires AVX512F.

func M512DivRoundPs

func M512DivRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512DivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			dst[i+31:i] := a[i+31:i] / b[i+31:i]
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_div_round_ps'. Requires AVX512F.
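
The 'rounding' argument is a plain int built by OR-ing the C-style _MM_FROUND_* values. A sketch using the conventional encodings from <xmmintrin.h> (declared by hand here, since this demonstration package may not export them):

// Assumed _MM_FROUND_* encodings: the low two bits select the
// rounding mode, 0x04 means "use MXCSR.RC", and bit 3 suppresses
// exceptions.
const (
	froundToNearestInt = 0x00
	froundToNegInf     = 0x01
	froundToPosInf     = 0x02
	froundToZero       = 0x03
	froundCurDirection = 0x04
	froundNoExc        = 0x08
)

// For example, truncate and suppress exceptions:
var roundingTruncate = froundToZero | froundNoExc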

func M512ErfPd

func M512ErfPd(a x86.M512d) (dst x86.M512d)

M512ErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erf_pd'. Requires AVX512F.

func M512ErfPs

func M512ErfPs(a x86.M512) (dst x86.M512)

M512ErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erf_ps'. Requires AVX512F.

func M512ErfcPd

func M512ErfcPd(a x86.M512d) (dst x86.M512d)

M512ErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfc_pd'. Requires AVX512F.

func M512ErfcPs

func M512ErfcPs(a x86.M512) (dst x86.M512)

M512ErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfc_ps'. Requires AVX512F.

func M512ErfcinvPd

func M512ErfcinvPd(a x86.M512d) (dst x86.M512d)

M512ErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvERFC(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfcinv_pd'. Requires AVX512F.

func M512ErfcinvPs

func M512ErfcinvPs(a x86.M512) (dst x86.M512)

M512ErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvERFC(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfcinv_ps'. Requires AVX512F.

func M512ErfinvPd

func M512ErfinvPd(a x86.M512d) (dst x86.M512d)

M512ErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfinv_pd'. Requires AVX512F.

func M512ErfinvPs

func M512ErfinvPs(a x86.M512) (dst x86.M512)

M512ErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_erfinv_ps'. Requires AVX512F.

func M512Exp10Pd

func M512Exp10Pd(a x86.M512d) (dst x86.M512d)

M512Exp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp10_pd'. Requires AVX512F.

func M512Exp10Ps

func M512Exp10Ps(a x86.M512) (dst x86.M512)

M512Exp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp10_ps'. Requires AVX512F.

func M512Exp2Pd

func M512Exp2Pd(a x86.M512d) (dst x86.M512d)

M512Exp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp2_pd'. Requires AVX512F.

func M512Exp2Ps

func M512Exp2Ps(a x86.M512) (dst x86.M512)

M512Exp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp2_ps'. Requires AVX512F.

func M512ExpPd

func M512ExpPd(a x86.M512d) (dst x86.M512d)

M512ExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp_pd'. Requires AVX512F.

func M512ExpPs

func M512ExpPs(a x86.M512) (dst x86.M512)

M512ExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_exp_ps'. Requires AVX512F.

func M512Expm1Pd

func M512Expm1Pd(a x86.M512d) (dst x86.M512d)

M512Expm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_expm1_pd'. Requires AVX512F.

func M512Expm1Ps

func M512Expm1Ps(a x86.M512) (dst x86.M512)

M512Expm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_expm1_ps'. Requires AVX512F.
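
Per lane this is the standard expm1 function; Go's math.Expm1 exists precisely because computing e^x - 1 naively loses all precision for tiny x. A scalar reference (assumes import "math"):

// expm1Lane keeps full precision near zero: math.Expm1(1e-20) is
// about 1e-20, whereas math.Exp(1e-20)-1 rounds to exactly 0.
func expm1Lane(x float64) float64 {
	return math.Expm1(x)
}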

func M512Extractf32x4Ps

func M512Extractf32x4Ps(a x86.M512, imm8 byte) (dst x86.M128)

M512Extractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
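
The CASE table above is plain index arithmetic; a scalar model treating the 512-bit register as [16]float32 (a sketch, not this package's API):

// extractLane128 returns 128-bit lane n of a 512-bit vector:
// lane n covers elements 4n..4n+3, and only imm8[1:0] participate.
func extractLane128(a [16]float32, imm8 byte) [4]float32 {
	n := int(imm8 & 3)
	var dst [4]float32
	copy(dst[:], a[4*n:4*n+4])
	return dst
}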

func M512Extractf64x4Pd

func M512Extractf64x4Pd(a x86.M512d, imm8 byte) (dst x86.M256d)

M512Extractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Extracti32x4Epi32

func M512Extracti32x4Epi32(a x86.M512i, imm8 byte) (dst x86.M128i)

M512Extracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Extracti64x4Epi64

func M512Extracti64x4Epi64(a x86.M512i, imm8 byte) (dst x86.M256i)

M512Extracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmPd

func M512FixupimmPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512FixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
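
The least obvious step above is token_response: 'c' acts as a per-lane table of sixteen 4-bit responses indexed by the token class j. A sketch of that nibble extraction:

// tokenResponse pulls the 4-bit response for token class j out of
// a 64-bit table lane, i.e. src3[3+4*j:4*j] in the pseudocode.
func tokenResponse(table uint64, j uint) uint8 {
	return uint8(table>>(4*j)) & 0xF
}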

func M512FixupimmPs

func M512FixupimmPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512FixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmRoundPd

func M512FixupimmRoundPd(a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512FixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FixupimmRoundPs

func M512FixupimmRoundPs(a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512FixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512FloorPd

func M512FloorPd(a x86.M512d) (dst x86.M512d)

M512FloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_floor_pd'. Requires AVX512F.

func M512FloorPs

func M512FloorPs(a x86.M512) (dst x86.M512)

M512FloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_floor_ps'. Requires AVX512F.

func M512FmaddsubPd

func M512FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_pd'. Requires AVX512F.

func M512FmaddsubPs

func M512FmaddsubPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_ps'. Requires AVX512F.
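
The even/odd alternation is the entire trick; a Go model of the unmasked operation over plain slices (a sketch, not this package's vector types). The fmsubadd variants below are the same loop with the two signs swapped:

// fmaddsub: even lanes compute a*b - c, odd lanes a*b + c.
func fmaddsub(a, b, c []float32) []float32 {
	dst := make([]float32, len(a))
	for j := range dst {
		if j%2 == 0 {
			dst[j] = a[j]*b[j] - c[j]
		} else {
			dst[j] = a[j]*b[j] + c[j]
		}
	}
	return dst
}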

func M512FmaddsubRoundPd

func M512FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF (j is even)
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_fmaddsub_round_pd'. Requires AVX512F.

func M512FmaddsubRoundPs

func M512FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF (j is even)
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_fmaddsub_round_ps'. Requires AVX512F.

func M512FmsubaddPd

func M512FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_pd'. Requires AVX512F.

func M512FmsubaddPs

func M512FmsubaddPs(a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_ps'. Requires AVX512F.

func M512FmsubaddRoundPd

func M512FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF (j is even)
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_fmsubadd_round_pd'. Requires AVX512F.

func M512FmsubaddRoundPs

func M512FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF (j is even)
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_fmsubadd_round_ps'. Requires AVX512F.

func M512HypotPd

func M512HypotPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512HypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_hypot_pd'. Requires AVX512F.

func M512HypotPs

func M512HypotPs(a x86.M512, b x86.M512) (dst x86.M512)

M512HypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_hypot_ps'. Requires AVX512F.
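
Per lane this is the classic hypot; Go's math.Hypot is the scalar reference, and unlike a literal SQRT(a^2 + b^2) it avoids spurious overflow and underflow when squaring:

// hypotLane (assumes import "math"): math.Hypot(1e200, 1e200) is
// finite, while squaring the operands first would overflow.
func hypotLane(a, b float64) float64 {
	return math.Hypot(a, b)
}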

func M512Insertf32x4

func M512Insertf32x4(a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512Insertf32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Insertf64x4

func M512Insertf64x4(a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512Insertf64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[0]) of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Inserti32x4

func M512Inserti32x4(a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512Inserti32x4: Copy 'a' to 'dst', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Inserti64x4

func M512Inserti64x4(a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512Inserti64x4: Copy 'a' to 'dst', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[0]) of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512InvsqrtPd

func M512InvsqrtPd(a x86.M512d) (dst x86.M512d)

M512InvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_invsqrt_pd'. Requires AVX512F.

func M512InvsqrtPs

func M512InvsqrtPs(a x86.M512) (dst x86.M512)

M512InvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_invsqrt_ps'. Requires AVX512F.

func M512Kand

func M512Kand(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kand: Compute the bitwise AND of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] AND b[15:0]
k[MAX:16] := 0

Instruction: 'KANDW'. Intrinsic: '_mm512_kand'. Requires AVX512F.

func M512Kandn

func M512Kandn(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kandn: Compute the bitwise AND NOT of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := (NOT a[15:0]) AND b[15:0]
k[MAX:16] := 0

Instruction: 'KANDNW'. Intrinsic: '_mm512_kandn'. Requires AVX512F.

func M512Kmov

func M512Kmov(a x86.Mmask16) (dst x86.Mmask16)

M512Kmov: Copy 16-bit mask 'a' to 'k'.

k[15:0] := a[15:0]
k[MAX:16] := 0

Instruction: 'KMOVW'. Intrinsic: '_mm512_kmov'. Requires AVX512F.

func M512Knot

func M512Knot(a x86.Mmask16) (dst x86.Mmask16)

M512Knot: Compute the bitwise NOT of 16-bit mask 'a', and store the result in 'k'.

k[15:0] := NOT a[15:0]
k[MAX:16] := 0

Instruction: 'KNOTW'. Intrinsic: '_mm512_knot'. Requires AVX512F.

func M512Kor

func M512Kor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kor: Compute the bitwise OR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] OR b[15:0]
k[MAX:16] := 0

Instruction: 'KORW'. Intrinsic: '_mm512_kor'. Requires AVX512F.

func M512Kortestc

func M512Kortestc(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestc: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. CF flag is set if 'dst' consists of all 1's.

dst[15:0] := k1[15:0] | k2[15:0]
IF PopCount(dst[15:0]) = 16
	SetCF()
FI

Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestc'. Requires AVX512F.

func M512Kortestz

func M512Kortestz(k1 x86.Mmask16, k2 x86.Mmask16) int

M512Kortestz: Performs bitwise OR between 'k1' and 'k2', storing the result in 'dst'. ZF flag is set if 'dst' is 0.

dst[15:0] := k1[15:0] | k2[15:0]
IF dst = 0
	SetZF()
FI

Instruction: 'KORTESTW'. Intrinsic: '_mm512_kortestz'. Requires AVX512F.
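
Both KORTESTW predicates are easy to model on a plain uint16 (a sketch; assumes import "math/bits"):

// kortestc reports CF: set when the OR of the masks is all ones.
func kortestc(k1, k2 uint16) bool {
	return bits.OnesCount16(k1|k2) == 16
}

// kortestz reports ZF: set when the OR of the masks is zero.
func kortestz(k1, k2 uint16) bool {
	return k1|k2 == 0
}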

func M512Kunpackb

func M512Kunpackb(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kunpackb: Unpack and interleave 8 bits from masks 'a' and 'b', and store the 16-bit result in 'k'.

k[7:0] := b[7:0]
k[15:8] := a[7:0]
k[MAX:16] := 0

Instruction: 'KUNPCKBW'. Intrinsic: '_mm512_kunpackb'. Requires AVX512F.

func M512Kxnor

func M512Kxnor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxnor: Compute the bitwise XNOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := NOT (a[15:0] XOR b[15:0])
k[MAX:16] := 0

Instruction: 'KXNORW'. Intrinsic: '_mm512_kxnor'. Requires AVX512F.

func M512Kxor

func M512Kxor(a x86.Mmask16, b x86.Mmask16) (dst x86.Mmask16)

M512Kxor: Compute the bitwise XOR of 16-bit masks 'a' and 'b', and store the result in 'k'.

k[15:0] := a[15:0] XOR b[15:0]
k[MAX:16] := 0

Instruction: 'KXORW'. Intrinsic: '_mm512_kxor'. Requires AVX512F.
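
The whole 16-bit mask family maps one-to-one onto uint16 bit operations; a compact sketch of the ops above:

func kand(a, b uint16) uint16     { return a & b }
func kandn(a, b uint16) uint16    { return ^a & b }
func knot(a uint16) uint16        { return ^a }
func kor(a, b uint16) uint16      { return a | b }
func kxor(a, b uint16) uint16     { return a ^ b }
func kxnor(a, b uint16) uint16    { return ^(a ^ b) }
func kunpackb(a, b uint16) uint16 { return a<<8 | b&0xFF }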

func M512Log10Pd

func M512Log10Pd(a x86.M512d) (dst x86.M512d)

M512Log10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log10_pd'. Requires AVX512F.

func M512Log10Ps

func M512Log10Ps(a x86.M512) (dst x86.M512)

M512Log10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log10_ps'. Requires AVX512F.

func M512Log1pPd

func M512Log1pPd(a x86.M512d) (dst x86.M512d)

M512Log1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log1p_pd'. Requires AVX512F.

func M512Log1pPs

func M512Log1pPs(a x86.M512) (dst x86.M512)

M512Log1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log1p_ps'. Requires AVX512F.

func M512Log2Pd

func M512Log2Pd(a x86.M512d) (dst x86.M512d)

M512Log2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log2_pd'. Requires AVX512F.

func M512LogPd

func M512LogPd(a x86.M512d) (dst x86.M512d)

M512LogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log_pd'. Requires AVX512F.

func M512LogPs

func M512LogPs(a x86.M512) (dst x86.M512)

M512LogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_log_ps'. Requires AVX512F.

func M512LogbPd

func M512LogbPd(a x86.M512d) (dst x86.M512d)

M512LogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_logb_pd'. Requires AVX512F.

func M512LogbPs

func M512LogbPs(a x86.M512) (dst x86.M512)

M512LogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_logb_ps'. Requires AVX512F.
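
ConvertExpFP* is the unbiased binary exponent; the scalar reference is math.Logb (a sketch, assumes import "math"):

// logbLane returns the exponent of x as a float, e.g.
// math.Logb(10) == 3, matching floor(log2(10)).
func logbLane(x float64) float64 {
	return math.Logb(x)
}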

func M512Mask2Permutex2varEpi32

func M512Mask2Permutex2varEpi32(a x86.M512i, idx x86.M512i, k x86.Mmask16, b x86.M512i) (dst x86.M512i)

M512Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm512_mask2_permutex2var_epi32'. Requires AVX512F.

func M512Mask2Permutex2varEpi64

func M512Mask2Permutex2varEpi64(a x86.M512i, idx x86.M512i, k x86.Mmask8, b x86.M512i) (dst x86.M512i)

M512Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm512_mask2_permutex2var_epi64'. Requires AVX512F.

func M512Mask2Permutex2varPd

func M512Mask2Permutex2varPd(a x86.M512d, idx x86.M512i, k x86.Mmask8, b x86.M512d) (dst x86.M512d)

M512Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm512_mask2_permutex2var_pd'. Requires AVX512F.

func M512Mask2Permutex2varPs

func M512Mask2Permutex2varPs(a x86.M512, idx x86.M512i, k x86.Mmask16, b x86.M512) (dst x86.M512)

M512Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm512_mask2_permutex2var_ps'. Requires AVX512F.
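
In the 32-bit forms each idx lane carries five significant bits: the low four select a lane, bit 4 selects the source. A scalar model of one lane (a sketch over arrays, not this package's types):

// permute2Lane32: idx bits [3:0] pick the lane, bit 4 picks
// between the two 16-lane sources a and b.
func permute2Lane32(a, b [16]uint32, idx uint32) uint32 {
	off := idx & 0xF
	if idx&0x10 != 0 {
		return b[off]
	}
	return a[off]
}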

func M512Mask3FmaddsubPd

func M512Mask3FmaddsubPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_pd'. Requires AVX512F.

func M512Mask3FmaddsubPs

func M512Mask3FmaddsubPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_ps'. Requires AVX512F.

func M512Mask3FmaddsubRoundPd

func M512Mask3FmaddsubRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask3_fmaddsub_round_pd'. Requires AVX512F.

func M512Mask3FmaddsubRoundPs

func M512Mask3FmaddsubRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask3_fmaddsub_round_ps'. Requires AVX512F.

func M512Mask3FmsubaddPd

func M512Mask3FmsubaddPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8) (dst x86.M512d)

M512Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_pd'. Requires AVX512F.

func M512Mask3FmsubaddPs

func M512Mask3FmsubaddPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16) (dst x86.M512)

M512Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_ps'. Requires AVX512F.

func M512Mask3FmsubaddRoundPd

func M512Mask3FmsubaddRoundPd(a x86.M512d, b x86.M512d, c x86.M512d, k x86.Mmask8, rounding int) (dst x86.M512d)

M512Mask3FmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := c[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask3_fmsubadd_round_pd'. Requires AVX512F.

func M512Mask3FmsubaddRoundPs

func M512Mask3FmsubaddRoundPs(a x86.M512, b x86.M512, c x86.M512, k x86.Mmask16, rounding int) (dst x86.M512)

M512Mask3FmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := c[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask3_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskAbsEpi32

func M512MaskAbsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_mask_abs_epi32'. Requires AVX512F.
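
The writemask convention here is shared by every _mm512_mask_* intrinsic that follows: lane j takes the operation's result when bit j of 'k' is set, and src's lane otherwise. A scalar model (a sketch; the real mask type is x86.Mmask16):

// maskedAbs32: per-lane absolute value under a 16-bit writemask.
func maskedAbs32(src [16]int32, k uint16, a [16]int32) [16]int32 {
	dst := src
	for j := 0; j < 16; j++ {
		if k&(1<<j) != 0 {
			v := a[j]
			if v < 0 {
				v = -v
			}
			dst[j] = v
		}
	}
	return dst
}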

func M512MaskAbsEpi64

func M512MaskAbsEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_mask_abs_epi64'. Requires AVX512F.

func M512MaskAcosPd

func M512MaskAcosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ACOS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acos_pd'. Requires AVX512F.

func M512MaskAcosPs

func M512MaskAcosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ACOS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acos_ps'. Requires AVX512F.

func M512MaskAcoshPd

func M512MaskAcoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ACOSH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acosh_pd'. Requires AVX512F.

func M512MaskAcoshPs

func M512MaskAcoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ACOSH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_acosh_ps'. Requires AVX512F.

func M512MaskAddEpi64

func M512MaskAddEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_mask_add_epi64'. Requires AVX512F.

func M512MaskAlignrEpi64

func M512MaskAlignrEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := temp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_mask_alignr_epi64'. Requires AVX512F.
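
Ignoring the writemask, VALIGNQ is a lane-granular funnel shift; a scalar model treating each 512-bit operand as [8]uint64 (a sketch; count is assumed to be in 0..7):

// alignrQ concatenates a:b (a in the high lanes), shifts right by
// 'count' 64-bit lanes, and keeps the low 8 lanes.
func alignrQ(a, b [8]uint64, count uint) [8]uint64 {
	var tmp [16]uint64
	copy(tmp[0:8], b[:])
	copy(tmp[8:16], a[:])
	var dst [8]uint64
	copy(dst[:], tmp[count:count+8])
	return dst
}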

func M512MaskAsinPd

func M512MaskAsinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ASIN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asin_pd'. Requires AVX512F.

func M512MaskAsinPs

func M512MaskAsinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ASIN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asin_ps'. Requires AVX512F.

func M512MaskAsinhPd

func M512MaskAsinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ASINH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asinh_pd'. Requires AVX512F.

func M512MaskAsinhPs

func M512MaskAsinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ASINH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_asinh_ps'. Requires AVX512F.

func M512MaskAtan2Pd

func M512MaskAtan2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAtan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan2_pd'. Requires AVX512F.

func M512MaskAtan2Ps

func M512MaskAtan2Ps(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAtan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan2_ps'. Requires AVX512F.

func M512MaskAtanPd

func M512MaskAtanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATAN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan_pd'. Requires AVX512F.

func M512MaskAtanPs

func M512MaskAtanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATAN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atan_ps'. Requires AVX512F.

func M512MaskAtanhPd

func M512MaskAtanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskAtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ATANH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atanh_pd'. Requires AVX512F.

func M512MaskAtanhPs

func M512MaskAtanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskAtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' expressed in radians using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ATANH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_atanh_ps'. Requires AVX512F.

func M512MaskBroadcastF32x4

func M512MaskBroadcastF32x4(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_mask_broadcast_f32x4'. Requires AVX512F.
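
The j mod 4 indexing above repeats the 128-bit source across all four 128-bit groups of the destination. A scalar Go sketch (hypothetical helper, not the package API):

	// maskBroadcastF32x4 models _mm512_mask_broadcast_f32x4: lane j takes
	// a[j mod 4] when its mask bit is set, else src[j].
	func maskBroadcastF32x4(src [16]float32, k uint16, a [4]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j%4]
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}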

func M512MaskBroadcastF64x4

func M512MaskBroadcastF64x4(src x86.M512d, k x86.Mmask8, a x86.M256d) (dst x86.M512d)

M512MaskBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_mask_broadcast_f64x4'. Requires AVX512F.

func M512MaskBroadcastI32x4

func M512MaskBroadcastI32x4(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_mask_broadcast_i32x4'. Requires AVX512F.

func M512MaskBroadcastI64x4

func M512MaskBroadcastI64x4(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_mask_broadcast_i64x4'. Requires AVX512F.

func M512MaskBroadcastdEpi32

func M512MaskBroadcastdEpi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_broadcastd_epi32'. Requires AVX512F.

func M512MaskBroadcastqEpi64

func M512MaskBroadcastqEpi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_broadcastq_epi64'. Requires AVX512F.

func M512MaskBroadcastsdPd

func M512MaskBroadcastsdPd(src x86.M512d, k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_mask_broadcastsd_pd'. Requires AVX512F.

func M512MaskBroadcastssPs

func M512MaskBroadcastssPs(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_mask_broadcastss_ps'. Requires AVX512F.

func M512MaskCbrtPd

func M512MaskCbrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CubeRoot(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_pd'. Requires AVX512F.

func M512MaskCbrtPs

func M512MaskCbrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CubeRoot(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cbrt_ps'. Requires AVX512F.

func M512MaskCdfnormPd

func M512MaskCdfnormPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CDFNormal(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_pd'. Requires AVX512F.
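
For reference, the CDFNormal operation in the pseudocode is the standard normal CDF, which can be written with math.Erf; this is the mathematical definition, not the SVML implementation (assumes import "math"):

	// cdfNormal evaluates one lane of CDFNormal using the identity
	// Φ(x) = (1 + erf(x/√2)) / 2.
	func cdfNormal(x float64) float64 {
		return 0.5 * (1 + math.Erf(x/math.Sqrt2))
	}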

func M512MaskCdfnormPs

func M512MaskCdfnormPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CDFNormal(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorm_ps'. Requires AVX512F.

func M512MaskCdfnorminvPd

func M512MaskCdfnorminvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InverseCDFNormal(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_pd'. Requires AVX512F.

func M512MaskCdfnorminvPs

func M512MaskCdfnorminvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InverseCDFNormal(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cdfnorminv_ps'. Requires AVX512F.

func M512MaskCeilPd

func M512MaskCeilPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := CEIL(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_ceil_pd'. Requires AVX512F.

func M512MaskCeilPs

func M512MaskCeilPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := CEIL(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_ceil_ps'. Requires AVX512F.

func M512MaskCmpEpi64Mask

func M512MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
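
A scalar Go sketch of the predicate dispatch above (hypothetical helper; the _MM_CMPINT_* names appear only in comments since the package does not export them):

	// maskCmpEpi64 models _mm512_mask_cmp_epi64_mask: imm8 selects one of
	// the eight predicates, and k1 zeroes lanes whose mask bit is clear.
	func maskCmpEpi64(k1 uint8, a, b [8]int64, imm8 byte) (k uint8) {
		cmp := func(x, y int64) bool {
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				return x == y
			case 1: // _MM_CMPINT_LT
				return x < y
			case 2: // _MM_CMPINT_LE
				return x <= y
			case 3: // _MM_CMPINT_FALSE
				return false
			case 4: // _MM_CMPINT_NEQ
				return x != y
			case 5: // _MM_CMPINT_NLT
				return x >= y
			case 6: // _MM_CMPINT_NLE
				return x > y
			default: // _MM_CMPINT_TRUE
				return true
			}
		}
		for j := 0; j < 8; j++ {
			if k1&(1<<uint(j)) != 0 && cmp(a[j], b[j]) {
				k |= 1 << uint(j)
			}
		}
		return k
	}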

func M512MaskCmpEpu64Mask

func M512MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask8)

M512MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpeqEpi64Mask

func M512MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPEQQ'. Intrinsic: '_mm512_mask_cmpeq_epi64_mask'. Requires AVX512F.

func M512MaskCmpeqEpu64Mask

func M512MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpeq_epu64_mask'. Requires AVX512F.

func M512MaskCmpgeEpi64Mask

func M512MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpge_epi64_mask'. Requires AVX512F.

func M512MaskCmpgeEpu64Mask

func M512MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpge_epu64_mask'. Requires AVX512F.

func M512MaskCmpgtEpi64Mask

func M512MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPGTQ'. Intrinsic: '_mm512_mask_cmpgt_epi64_mask'. Requires AVX512F.

func M512MaskCmpgtEpu64Mask

func M512MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpgt_epu64_mask'. Requires AVX512F.

func M512MaskCmpleEpi64Mask

func M512MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmple_epi64_mask'. Requires AVX512F.

func M512MaskCmpleEpu64Mask

func M512MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmple_epu64_mask'. Requires AVX512F.

func M512MaskCmpltEpi32Mask

func M512MaskCmpltEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm512_mask_cmplt_epi32_mask'. Requires AVX512F.

func M512MaskCmpltEpi64Mask

func M512MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmplt_epi64_mask'. Requires AVX512F.

func M512MaskCmpltEpu64Mask

func M512MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmplt_epu64_mask'. Requires AVX512F.

func M512MaskCmpneqEpi64Mask

func M512MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm512_mask_cmpneq_epi64_mask'. Requires AVX512F.

func M512MaskCmpneqEpu64Mask

func M512MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm512_mask_cmpneq_epu64_mask'. Requires AVX512F.

func M512MaskCompressEpi32

func M512MaskCompressEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_mask_compress_epi32'. Requires AVX512F.
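
A scalar Go sketch of the compress pattern above (hypothetical helper, not the package API):

	// maskCompressEpi32 models _mm512_mask_compress_epi32: active lanes of
	// a are packed contiguously into the low lanes of dst, and the
	// remaining high lanes are passed through from src.
	func maskCompressEpi32(src [16]int32, k uint16, a [16]int32) (dst [16]int32) {
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		copy(dst[m:], src[m:])
		return dst
	}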

func M512MaskCompressEpi64

func M512MaskCompressEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_mask_compress_epi64'. Requires AVX512F.

func M512MaskCompressPd

func M512MaskCompressPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_mask_compress_pd'. Requires AVX512F.

func M512MaskCompressPs

func M512MaskCompressPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_mask_compress_ps'. Requires AVX512F.

func M512MaskCosPd

func M512MaskCosPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cos_pd'. Requires AVX512F.
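
Since the SVML cosine maps to no single instruction, a lane-by-lane Go model with math.Cos conveys the semantics; the accuracy and special-case behavior of the real SVML sequence may differ (assumes import "math"):

	// maskCosPd models _mm512_mask_cos_pd one lane at a time.
	func maskCosPd(src [8]float64, k uint8, a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Cos(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}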

func M512MaskCosPs

func M512MaskCosPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cos_ps'. Requires AVX512F.

func M512MaskCosdPd

func M512MaskCosdPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COSD(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosd_pd'. Requires AVX512F.

func M512MaskCosdPs

func M512MaskCosdPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COSD(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosd_ps'. Requires AVX512F.

func M512MaskCoshPd

func M512MaskCoshPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskCoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := COSH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosh_pd'. Requires AVX512F.

func M512MaskCoshPs

func M512MaskCoshPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskCoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := COSH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_cosh_ps'. Requires AVX512F.

func M512MaskCvtRoundepi32Ps

func M512MaskCvtRoundepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepi32_ps'. Requires AVX512F.

func M512MaskCvtRoundepu32Ps

func M512MaskCvtRoundepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepu32_ps'. Requires AVX512F.

func M512MaskCvtRoundpdEpi32

func M512MaskCvtRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epi32'. Requires AVX512F.
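
The four explicit rounding modes map naturally onto Go's math package. A sketch for in-range inputs, using local stand-in constants rather than the _MM_FROUND_* values; hardware overflow behavior is not modelled (assumes import "math"):

	// roundMode is a local stand-in for the _MM_FROUND_* selector.
	type roundMode int

	const (
		toNearest roundMode = iota // round to nearest, ties to even
		toNegInf                   // round down
		toPosInf                   // round up
		toZero                     // truncate
	)

	// cvtRoundPdEpi32 models _mm512_mask_cvt_roundpd_epi32.
	func cvtRoundPdEpi32(src [8]int32, k uint8, a [8]float64, mode roundMode) (dst [8]int32) {
		round := [...]func(float64) float64{
			toNearest: math.RoundToEven,
			toNegInf:  math.Floor,
			toPosInf:  math.Ceil,
			toZero:    math.Trunc,
		}[mode]
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(round(a[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}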

func M512MaskCvtRoundpdEpu32

func M512MaskCvtRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epu32'. Requires AVX512F.

func M512MaskCvtRoundpdPs

func M512MaskCvtRoundpdPs(src x86.M256, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)

M512MaskCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvt_roundpd_ps'. Requires AVX512F.

func M512MaskCvtRoundphPs

func M512MaskCvtRoundphPs(src x86.M512, k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)

M512MaskCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		IF k[j]
			dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvt_roundph_ps'. Requires AVX512F.

func M512MaskCvtRoundpsEpi32

func M512MaskCvtRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvt_roundps_epi32'. Requires AVX512F.

func M512MaskCvtRoundpsEpu32

func M512MaskCvtRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvt_roundps_epu32'. Requires AVX512F.

func M512MaskCvtRoundpsPd

func M512MaskCvtRoundpsPd(src x86.M512d, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)

M512MaskCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		l := 32*j
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvt_roundps_pd'. Requires AVX512F.

func M512MaskCvtRoundpsPh

func M512MaskCvtRoundpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvt_roundps_ph'. Requires AVX512F.

func M512MaskCvtepi16Epi32

func M512MaskCvtepi16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_mask_cvtepi16_epi32'. Requires AVX512F.
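
In plain Go the SignExtend above is simply the integer conversion (hypothetical helper, not the package API):

	// maskCvtepi16Epi32 models _mm512_mask_cvtepi16_epi32; converting
	// int16 to int32 in Go performs the sign extension.
	func maskCvtepi16Epi32(src [16]int32, k uint16, a [16]int16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j]) // sign extend
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}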

func M512MaskCvtepi16Epi64

func M512MaskCvtepi16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_mask_cvtepi16_epi64'. Requires AVX512F.

func M512MaskCvtepi32Epi16

func M512MaskCvtepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_mask_cvtepi32_epi16'. Requires AVX512F.
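
The truncating narrow is likewise Go's integer conversion in the other direction (hypothetical helper, not the package API):

	// maskCvtepi32Epi16 models _mm512_mask_cvtepi32_epi16; converting
	// int32 to int16 in Go keeps only the low 16 bits, matching
	// Truncate_Int32_To_Int16.
	func maskCvtepi32Epi16(src [16]int16, k uint16, a [16]int32) (dst [16]int16) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int16(a[j]) // truncate: low 16 bits
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}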

func M512MaskCvtepi32Epi64

func M512MaskCvtepi32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_mask_cvtepi32_epi64'. Requires AVX512F.

func M512MaskCvtepi32Epi8

func M512MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_mask_cvtepi32_epi8'. Requires AVX512F.

func M512MaskCvtepi32Pd

func M512MaskCvtepi32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_mask_cvtepi32_pd'. Requires AVX512F.

func M512MaskCvtepi32Ps

func M512MaskCvtepi32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_mask_cvtepi32_ps'. Requires AVX512F.

func M512MaskCvtepi64Epi16

func M512MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_mask_cvtepi64_epi16'. Requires AVX512F.

func M512MaskCvtepi64Epi32

func M512MaskCvtepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_mask_cvtepi64_epi32'. Requires AVX512F.

func M512MaskCvtepi64Epi8

func M512MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_mask_cvtepi64_epi8'. Requires AVX512F.

func M512MaskCvtepi8Epi32

func M512MaskCvtepi8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_mask_cvtepi8_epi32'. Requires AVX512F.

func M512MaskCvtepi8Epi64

func M512MaskCvtepi8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_mask_cvtepi8_epi64'. Requires AVX512F.

func M512MaskCvtepu16Epi32

func M512MaskCvtepu16Epi32(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_mask_cvtepu16_epi32'. Requires AVX512F.
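
ZeroExtend also falls out of Go's conversion rules, since a uint16 value is never negative (hypothetical helper, not the package API):

	// maskCvtepu16Epi32 models _mm512_mask_cvtepu16_epi32.
	func maskCvtepu16Epi32(src [16]int32, k uint16, a [16]uint16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j]) // zero extend
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}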

func M512MaskCvtepu16Epi64

func M512MaskCvtepu16Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_mask_cvtepu16_epi64'. Requires AVX512F.

func M512MaskCvtepu32Epi64

func M512MaskCvtepu32Epi64(src x86.M512i, k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_mask_cvtepu32_epi64'. Requires AVX512F.

func M512MaskCvtepu32Pd

func M512MaskCvtepu32Pd(src x86.M512d, k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_mask_cvtepu32_pd'. Requires AVX512F.

func M512MaskCvtepu32Ps

func M512MaskCvtepu32Ps(src x86.M512, k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_mask_cvtepu32_ps'. Requires AVX512F.

func M512MaskCvtepu8Epi32

func M512MaskCvtepu8Epi32(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_mask_cvtepu8_epi32'. Requires AVX512F.

func M512MaskCvtepu8Epi64

func M512MaskCvtepu8Epi64(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_mask_cvtepu8_epi64'. Requires AVX512F.

func M512MaskCvtpdEpi32

func M512MaskCvtpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_mask_cvtpd_epi32'. Requires AVX512F.

func M512MaskCvtpdEpu32

func M512MaskCvtpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_mask_cvtpd_epu32'. Requires AVX512F.

func M512MaskCvtpdPs

func M512MaskCvtpdPs(src x86.M256, k x86.Mmask8, a x86.M512d) (dst x86.M256)

M512MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_mask_cvtpd_ps'. Requires AVX512F.

func M512MaskCvtphPs

func M512MaskCvtphPs(src x86.M512, k x86.Mmask16, a x86.M256i) (dst x86.M512)

M512MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_mask_cvtph_ps'. Requires AVX512F.
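
A sketch of the Convert_FP16_To_FP32 step for one lane, covering normal values, infinities, and NaNs; subnormal inputs are flushed to signed zero here for brevity (assumes import "math"):

	// convertFP16ToFP32 widens an IEEE 754 half-precision bit pattern:
	// the exponent is rebiased from 15 to 127 and the 10-bit fraction is
	// shifted into the 23-bit single-precision fraction field.
	func convertFP16ToFP32(h uint16) float32 {
		sign := uint32(h>>15) << 31
		exp := uint32(h>>10) & 0x1f
		frac := uint32(h) & 0x3ff
		switch exp {
		case 0: // zero or subnormal (subnormals elided here)
			return math.Float32frombits(sign)
		case 0x1f: // Inf or NaN
			return math.Float32frombits(sign | 0xff<<23 | frac<<13)
		default: // normal
			return math.Float32frombits(sign | (exp+112)<<23 | frac<<13)
		}
	}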

func M512MaskCvtpsEpi32

func M512MaskCvtpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_mask_cvtps_epi32'. Requires AVX512F.

func M512MaskCvtpsEpu32

func M512MaskCvtpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_mask_cvtps_epu32'. Requires AVX512F.

func M512MaskCvtpsPd

func M512MaskCvtpsPd(src x86.M512d, k x86.Mmask8, a x86.M256) (dst x86.M512d)

M512MaskCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_mask_cvtps_pd'. Requires AVX512F.

func M512MaskCvtpsPh

func M512MaskCvtpsPh(src x86.M256i, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_mask_cvtps_ph'. Requires AVX512F.

func M512MaskCvtsepi32Epi16

func M512MaskCvtsepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_mask_cvtsepi32_epi16'. Requires AVX512F.

func M512MaskCvtsepi32Epi8

func M512MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_mask_cvtsepi32_epi8'. Requires AVX512F.
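
Saturate_Int32_To_Int8 clamps rather than wraps; a one-lane Go model:

	// saturateInt32ToInt8 clamps v to the int8 range [-128, 127].
	func saturateInt32ToInt8(v int32) int8 {
		switch {
		case v > 127:
			return 127
		case v < -128:
			return -128
		}
		return int8(v)
	}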

func M512MaskCvtsepi64Epi16

func M512MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_mask_cvtsepi64_epi16'. Requires AVX512F.

func M512MaskCvtsepi64Epi32

func M512MaskCvtsepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_mask_cvtsepi64_epi32'. Requires AVX512F.

func M512MaskCvtsepi64Epi8

func M512MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_mask_cvtsepi64_epi8'. Requires AVX512F.

func M512MaskCvttRoundpdEpi32

func M512MaskCvttRoundpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epi32'. Requires AVX512F.

func M512MaskCvttRoundpdEpu32

func M512MaskCvttRoundpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epu32'. Requires AVX512F.

func M512MaskCvttRoundpsEpi32

func M512MaskCvttRoundpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epi32'. Requires AVX512F.

func M512MaskCvttRoundpsEpu32

func M512MaskCvttRoundpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epu32'. Requires AVX512F.

func M512MaskCvttpdEpi32

func M512MaskCvttpdEpi32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_mask_cvttpd_epi32'. Requires AVX512F.
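
As a rough model of the truncation, note that Go's conversion from float64 to int32 also truncates toward zero. A hedged sketch of the masked loop follows (hypothetical helper, not the intrinsic; unlike the hardware, Go leaves out-of-range conversions implementation-dependent rather than producing 0x80000000):

	// cvttpdEpi32 converts each float64 lane to int32 with truncation
	// toward zero under writemask k; Go's int32(x) conversion truncates.
	func cvttpdEpi32(src [8]int32, k uint8, a [8]float64) (dst [8]int32) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}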

func M512MaskCvttpdEpu32

func M512MaskCvttpdEpu32(src x86.M256i, k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_mask_cvttpd_epu32'. Requires AVX512F.

func M512MaskCvttpsEpi32

func M512MaskCvttpsEpi32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_mask_cvttps_epi32'. Requires AVX512F.

func M512MaskCvttpsEpu32

func M512MaskCvttpsEpu32(src x86.M512i, k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_mask_cvttps_epu32'. Requires AVX512F.

func M512MaskCvtusepi32Epi16

func M512MaskCvtusepi32Epi16(src x86.M256i, k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_mask_cvtusepi32_epi16'. Requires AVX512F.
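
The unsigned variant only needs an upper clamp, since the input is already non-negative. A minimal sketch (hypothetical helper name; assumes importing "math"):

	// saturateUint32ToUint16 clamps an unsigned 32-bit value to the
	// uint16 range, mirroring the unsigned-saturation step above.
	func saturateUint32ToUint16(v uint32) uint16 {
		if v > math.MaxUint16 {
			return math.MaxUint16
		}
		return uint16(v)
	}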

func M512MaskCvtusepi32Epi8

func M512MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_mask_cvtusepi32_epi8'. Requires AVX512F.

func M512MaskCvtusepi64Epi16

func M512MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_mask_cvtusepi64_epi16'. Requires AVX512F.

func M512MaskCvtusepi64Epi32

func M512MaskCvtusepi64Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_mask_cvtusepi64_epi32'. Requires AVX512F.

func M512MaskCvtusepi64Epi8

func M512MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_mask_cvtusepi64_epi8'. Requires AVX512F.

func M512MaskDivEpi32

func M512MaskDivEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskDivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_div_epi32'. Requires AVX512F.
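
The '...' instruction field marks this as an SVML-style sequence rather than a single machine instruction. In Go the TRUNCATE step is implicit, because integer division already truncates toward zero; a sketch of the semantics, which applies equally to the unsigned variant below (hypothetical helper; note that b[j] == 0 panics in Go, whereas the intrinsic's behavior is up to the library):

	// divEpi32 divides lane by lane under writemask k. Go's integer
	// division truncates toward zero, matching TRUNCATE in the pseudocode.
	func divEpi32(src [16]int32, k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] / b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}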

func M512MaskDivEpu32

func M512MaskDivEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskDivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_div_epu32'. Requires AVX512F.

func M512MaskDivPd

func M512MaskDivPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_pd'. Requires AVX512F.

func M512MaskDivPs

func M512MaskDivPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_ps'. Requires AVX512F.

func M512MaskDivRoundPd

func M512MaskDivRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			IF k[j]
				dst[i+63:i] := a[i+63:i] / b[i+63:i]
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_mask_div_round_pd'. Requires AVX512F.

func M512MaskDivRoundPs

func M512MaskDivRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := a[i+31:i] / b[i+31:i]
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_mask_div_round_ps'. Requires AVX512F.

func M512MaskErfPd

func M512MaskErfPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erf_pd'. Requires AVX512F.
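
The masked elementwise pattern is the same for the whole erf/exp/log family in this section; a minimal pure-Go sketch using the standard library's math.Erf (hypothetical helper name, not the intrinsic):

	// maskErfPd applies math.Erf lane by lane under writemask k,
	// copying src where the mask bit is clear.
	func maskErfPd(src [8]float64, k uint8, a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Erf(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}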

func M512MaskErfPs

func M512MaskErfPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erf_ps'. Requires AVX512F.

func M512MaskErfcPd

func M512MaskErfcPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 1.0 - ERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfc_pd'. Requires AVX512F.

func M512MaskErfcPs

func M512MaskErfcPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 1.0 - ERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfc_ps'. Requires AVX512F.

func M512MaskErfcinvPd

func M512MaskErfcinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvERFC(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_pd'. Requires AVX512F.

func M512MaskErfcinvPs

func M512MaskErfcinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvERFC(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfcinv_ps'. Requires AVX512F.

func M512MaskErfinvPd

func M512MaskErfinvPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvERF(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_pd'. Requires AVX512F.

func M512MaskErfinvPs

func M512MaskErfinvPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvERF(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_erfinv_ps'. Requires AVX512F.
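
The InvERF/InvERFC steps above are the functional inverses of erf and erfc, which Go's standard library exposes directly as math.Erfinv and math.Erfcinv. A single-precision sketch can compute in float64 and narrow back (hypothetical helper; same masking pattern as maskErfPd above):

	// maskErfinvPs computes the inverse error function per lane, going
	// through float64 since the math package has no float32 variants.
	func maskErfinvPs(src [16]float32, k uint16, a [16]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = float32(math.Erfinv(float64(a[j])))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}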

func M512MaskExp10Pd

func M512MaskExp10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 10^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp10_pd'. Requires AVX512F.

func M512MaskExp10Ps

func M512MaskExp10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 10^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp10_ps'. Requires AVX512F.

func M512MaskExp2Pd

func M512MaskExp2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 2^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp2_pd'. Requires AVX512F.

func M512MaskExp2Ps

func M512MaskExp2Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 2^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp2_ps'. Requires AVX512F.

func M512MaskExpPd

func M512MaskExpPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := e^(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp_pd'. Requires AVX512F.

func M512MaskExpPs

func M512MaskExpPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := e^(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_exp_ps'. Requires AVX512F.
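
The three exponential bases documented above map directly onto the standard library; a one-lane sketch (hypothetical helper, with the same masking pattern as maskErfPd):

	// expLane evaluates one lane for each exponential base documented
	// in this section: e^x, 2^x, and 10^x.
	func expLane(x float64) (e, e2, e10 float64) {
		return math.Exp(x), math.Exp2(x), math.Pow(10, x)
	}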

func M512MaskExpandEpi32

func M512MaskExpandEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_mask_expand_epi32'. Requires AVX512F.
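
Expand reads contiguous elements from the front of 'a' but scatters them into the destination lanes selected by 'k'. A pure-Go sketch of that bookkeeping (hypothetical helper, not the intrinsic):

	// maskExpandEpi32 consumes a[0], a[1], ... in order, writing each to
	// the next dst lane whose mask bit is set; other lanes copy src.
	func maskExpandEpi32(src [16]int32, k uint16, a [16]int32) (dst [16]int32) {
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m]
				m++
			} else {
				dst[j] = src[j]
			}
		}
		return
	}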

func M512MaskExpandEpi64

func M512MaskExpandEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_mask_expand_epi64'. Requires AVX512F.

func M512MaskExpandPd

func M512MaskExpandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_mask_expand_pd'. Requires AVX512F.

func M512MaskExpandPs

func M512MaskExpandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_mask_expand_ps'. Requires AVX512F.

func M512MaskExpm1Pd

func M512MaskExpm1Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskExpm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := e^(a[i+63:i]) - 1.0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_expm1_pd'. Requires AVX512F.

func M512MaskExpm1Ps

func M512MaskExpm1Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskExpm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := e^(a[i+31:i]) - 1.0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_expm1_ps'. Requires AVX512F.

func M512MaskExtractf32x4Ps

func M512MaskExtractf32x4Ps(src x86.M128, k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)

M512MaskExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_mask_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
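
Treating the 512-bit vector as sixteen float32 lanes, the extract is a 4-element window selected by imm8's low two bits, followed by the writemask. A pure-Go sketch (hypothetical helper, not the intrinsic):

	// maskExtractF32x4 selects one of the four 128-bit lanes (four
	// float32 values each) and applies the 4-bit writemask.
	func maskExtractF32x4(src [4]float32, k uint8, a [16]float32, imm8 byte) (dst [4]float32) {
		base := int(imm8&3) * 4 // imm8[1:0] selects the 128-bit lane
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[base+j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}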

func M512MaskExtractf64x4Pd

func M512MaskExtractf64x4Pd(src x86.M256d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)

M512MaskExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_mask_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti32x4Epi32

func M512MaskExtracti32x4Epi32(src x86.M128i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_mask_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti64x4Epi64

func M512MaskExtracti64x4Epi64(src x86.M256i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_mask_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmPd

func M512MaskFixupimmPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
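
The core of FIXUPIMM is a table lookup: the token classification of the input selects a 4-bit nibble of 'c', and that nibble selects the response. A sketch of just the nibble extraction, with the token class j already computed (hypothetical helper):

	// tokenResponse extracts the 4-bit response code for token class j
	// (0..7) from the 64-bit table word, as in
	// token_response[3:0] := src3[3+4*j:4*j].
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8((src3 >> (4 * j)) & 0xF)
	}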

func M512MaskFixupimmPs

func M512MaskFixupimmPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmRoundPd

func M512MaskFixupimmRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512MaskFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_mask_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFixupimmRoundPs

func M512MaskFixupimmRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512MaskFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_mask_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskFloorPd

func M512MaskFloorPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskFloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FLOOR(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_floor_pd'. Requires AVX512F.

func M512MaskFloorPs

func M512MaskFloorPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskFloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FLOOR(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_floor_ps'. Requires AVX512F.

func M512MaskFmaddsubPd

func M512MaskFmaddsubPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_pd'. Requires AVX512F.
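
A pure-Go sketch of the even/odd alternation, using math.FMA so each lane gets the single rounding a hardware fused multiply-add provides (hypothetical helper; the fmsubadd variants below simply swap the even/odd roles):

	// maskFmaddsubPd multiplies a and b, then subtracts c in even lanes
	// and adds it in odd lanes; unselected lanes keep a.
	func maskFmaddsubPd(a [8]float64, k uint8, b, c [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = a[j]
			case j%2 == 0:
				dst[j] = math.FMA(a[j], b[j], -c[j])
			default:
				dst[j] = math.FMA(a[j], b[j], c[j])
			}
		}
		return
	}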

func M512MaskFmaddsubPs

func M512MaskFmaddsubPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_ps'. Requires AVX512F.

func M512MaskFmaddsubRoundPd

func M512MaskFmaddsubRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_mask_fmaddsub_round_pd'. Requires AVX512F.

func M512MaskFmaddsubRoundPs

func M512MaskFmaddsubRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_mask_fmaddsub_round_ps'. Requires AVX512F.

func M512MaskFmsubaddPd

func M512MaskFmsubaddPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_pd'. Requires AVX512F.

func M512MaskFmsubaddPs

func M512MaskFmsubaddPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_ps'. Requires AVX512F.

func M512MaskFmsubaddRoundPd

func M512MaskFmsubaddRoundPd(a x86.M512d, k x86.Mmask8, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := a[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_mask_fmsubadd_round_pd'. Requires AVX512F.

func M512MaskFmsubaddRoundPs

func M512MaskFmsubaddRoundPs(a x86.M512, k x86.Mmask16, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := a[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_mask_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskHypotPd

func M512MaskHypotPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskHypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_hypot_pd'. Requires AVX512F.
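
In Go the natural building block is math.Hypot, which also guards against the overflow and underflow that the naive SQRT(a^2 + b^2) form can hit at the extremes; a masked sketch (hypothetical helper, not the intrinsic):

	// maskHypotPd computes sqrt(a*a + b*b) per lane via math.Hypot,
	// copying src where the mask bit is clear.
	func maskHypotPd(src [8]float64, k uint8, a, b [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Hypot(a[j], b[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}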

func M512MaskHypotPs

func M512MaskHypotPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskHypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_hypot_ps'. Requires AVX512F.

func M512MaskInsertf32x4

func M512MaskInsertf32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512MaskInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_mask_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf64x4

func M512MaskInsertf64x4(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512MaskInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_mask_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti32x4

func M512MaskInserti32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_mask_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti64x4

func M512MaskInserti64x4(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_mask_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskInvsqrtPd

func M512MaskInvsqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskInvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := InvSQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_pd'. Requires AVX512F.

func M512MaskInvsqrtPs

func M512MaskInvsqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskInvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := InvSQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_invsqrt_ps'. Requires AVX512F.

func M512MaskLog10Pd

func M512MaskLog10Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := log10(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log10_pd'. Requires AVX512F.

func M512MaskLog10Ps

func M512MaskLog10Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := log10(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log10_ps'. Requires AVX512F.

func M512MaskLog1pPd

func M512MaskLog1pPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ln(1.0 + a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log1p_pd'. Requires AVX512F.

func M512MaskLog1pPs

func M512MaskLog1pPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLog1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ln(1.0 + a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log1p_ps'. Requires AVX512F.

func M512MaskLog2Pd

func M512MaskLog2Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLog2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := log2(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log2_pd'. Requires AVX512F.

func M512MaskLogPd

func M512MaskLogPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ln(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log_pd'. Requires AVX512F.

func M512MaskLogPs

func M512MaskLogPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ln(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_log_ps'. Requires AVX512F.

func M512MaskLogbPd

func M512MaskLogbPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskLogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_logb_pd'. Requires AVX512F.

func M512MaskLogbPs

func M512MaskLogbPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskLogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_logb_ps'. Requires AVX512F.
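
Go's standard library exposes this exponent extraction directly as math.Logb, which returns the binary exponent of its argument, matching the 'floor(log2(x))' description above for finite nonzero inputs. A sketch of the masked double-precision variant (illustrative names):

	import "math"

	// maskLogbPd mirrors the pseudocode above using math.Logb.
	func maskLogbPd(src, a [8]float64, k uint8) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Logb(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}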

func M512MaskMaxEpi64

func M512MaskMaxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_mask_max_epi64'. Requires AVX512F.

func M512MaskMaxEpu64

func M512MaskMaxEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_mask_max_epu64'. Requires AVX512F.
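
A sketch of the masked 64-bit maximum on plain arrays; for the signed variant above, substitute int64 for uint64 (names are illustrative, and the min entries below simply flip the comparison):

	func maskMaxEpu64(src, a, b [8]uint64, k uint8) (dst [8]uint64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				if a[j] > b[j] { // element-wise maximum
					dst[j] = a[j]
				} else {
					dst[j] = b[j]
				}
			} else {
				dst[j] = src[j] // mask bit clear: keep src
			}
		}
		return
	}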

func M512MaskMaxPd

func M512MaskMaxPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_pd'. Requires AVX512F.

func M512MaskMaxPs

func M512MaskMaxPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_ps'. Requires AVX512F.

func M512MaskMaxRoundPd

func M512MaskMaxRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_mask_max_round_pd'. Requires AVX512F.

func M512MaskMaxRoundPs

func M512MaskMaxRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_mask_max_round_ps'. Requires AVX512F.

func M512MaskMinEpi64

func M512MaskMinEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_mask_min_epi64'. Requires AVX512F.

func M512MaskMinEpu64

func M512MaskMinEpu64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_mask_min_epu64'. Requires AVX512F.

func M512MaskMinPd

func M512MaskMinPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_pd'. Requires AVX512F.

func M512MaskMinPs

func M512MaskMinPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_ps'. Requires AVX512F.

func M512MaskMinRoundPd

func M512MaskMinRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_mask_min_round_pd'. Requires AVX512F.

func M512MaskMinRoundPs

func M512MaskMinRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := src[i+31:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_mask_min_round_ps'. Requires AVX512F.

func M512MaskMovedupPd

func M512MaskMovedupPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_mask_movedup_pd'. Requires AVX512F.
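
The duplication pattern above is easier to see on plain arrays: each even/odd element pair receives two copies of the even-indexed source element before the mask blend. A sketch (illustrative names; the movehdup/moveldup entries below are the 32-bit analogues, copying from the odd or even index of each pair):

	func maskMovedupPd(src, a [8]float64, k uint8) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j += 2 {
			tmp[j], tmp[j+1] = a[j], a[j] // duplicate the even-indexed element
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}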

func M512MaskMovehdupPs

func M512MaskMovehdupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_mask_movehdup_ps'. Requires AVX512F.

func M512MaskMoveldupPs

func M512MaskMoveldupPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_mask_moveldup_ps'. Requires AVX512F.

func M512MaskMulEpi32

func M512MaskMulEpi32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mask_mul_epi32'. Requires AVX512F.

func M512MaskMulEpu32

func M512MaskMulEpu32(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mask_mul_epu32'. Requires AVX512F.
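
Only the low 32 bits of each 64-bit lane participate in this multiply, and the full 64-bit product is kept. A sketch on plain arrays (illustrative names; the signed mul_epi32 entry above is the same with int32/int64 conversions):

	func maskMulEpu32(src, a, b [8]uint64, k uint8) (dst [8]uint64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				// truncate each lane to its low 32 bits, then widen the product
				dst[j] = uint64(uint32(a[j])) * uint64(uint32(b[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}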

func M512MaskMulloxEpi64

func M512MaskMulloxEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloxEpi64: Multiplies elements in packed 64-bit integer vectors 'a' and 'b' together, storing the lower 64 bits of the result in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_mullox_epi64'. Requires AVX512F.

func M512MaskNearbyintPd

func M512MaskNearbyintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskNearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := NearbyInt(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_pd'. Requires AVX512F.

func M512MaskNearbyintPs

func M512MaskNearbyintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskNearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := NearbyInt(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_nearbyint_ps'. Requires AVX512F.

func M512MaskPermutePd

func M512MaskPermutePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
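
Within each 128-bit lane (one pair of elements), bit j of 'imm8' picks the low or high source element for output j. A sketch of that selection plus the mask blend (illustrative names):

	func maskPermutePd(src, a [8]float64, k uint8, imm8 byte) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j++ {
			base := j &^ 1 // first element of this 128-bit lane
			if imm8&(1<<uint(j)) != 0 {
				tmp[j] = a[base+1] // high element of the lane
			} else {
				tmp[j] = a[base] // low element of the lane
			}
		}
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}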

func M512MaskPermutePs

func M512MaskPermutePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutevarPd

func M512MaskPermutevarPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)

M512MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_mask_permutevar_pd'. Requires AVX512F.

func M512MaskPermutevarPs

func M512MaskPermutevarPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)

M512MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_mask_permutevar_ps'. Requires AVX512F.

func M512MaskPermutex2varEpi32

func M512MaskPermutex2varEpi32(a x86.M512i, k x86.Mmask16, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm512_mask_permutex2var_epi32'. Requires AVX512F.
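
Each index element steers one output: its low four bits select a source element and bit 4 selects between 'a' and 'b'. Note that unselected outputs fall back to 'a' here, not to a separate 'src' operand. A sketch (illustrative names):

	func maskPermutex2varEpi32(a [16]uint32, k uint16, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				sel := idx[j] & 0xF // idx[i+3:i]: element selector
				if idx[j]&0x10 != 0 { // idx[i+4]: source selector
					dst[j] = b[sel]
				} else {
					dst[j] = a[sel]
				}
			} else {
				dst[j] = a[j] // mask bit clear: keep the 'a' element
			}
		}
		return
	}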

func M512MaskPermutex2varEpi64

func M512MaskPermutex2varEpi64(a x86.M512i, k x86.Mmask8, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm512_mask_permutex2var_epi64'. Requires AVX512F.

func M512MaskPermutex2varPd

func M512MaskPermutex2varPd(a x86.M512d, k x86.Mmask8, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm512_mask_permutex2var_pd'. Requires AVX512F.

func M512MaskPermutex2varPs

func M512MaskPermutex2varPs(a x86.M512, k x86.Mmask16, idx x86.M512i, b x86.M512) (dst x86.M512)

M512MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm512_mask_permutex2var_ps'. Requires AVX512F.

func M512MaskPermutexEpi64

func M512MaskPermutexEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutexPd

func M512MaskPermutexPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskPermutexvarEpi32

func M512MaskPermutexvarEpi32(src x86.M512i, k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_mask_permutexvar_epi32'. Requires AVX512F.
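
Unlike the in-lane permutes earlier, this one indexes across the whole vector: output j is simply a[idx[j] mod 16]. A sketch (illustrative names):

	func maskPermutexvarEpi32(src [16]uint32, k uint16, idx, a [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[idx[j]&0xF] // low 4 index bits select the element
			} else {
				dst[j] = src[j]
			}
		}
		return
	}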

func M512MaskPermutexvarEpi64

func M512MaskPermutexvarEpi64(src x86.M512i, k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_mask_permutexvar_epi64'. Requires AVX512F.

func M512MaskPermutexvarPd

func M512MaskPermutexvarPd(src x86.M512d, k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512MaskPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_mask_permutexvar_pd'. Requires AVX512F.

func M512MaskPermutexvarPs

func M512MaskPermutexvarPs(src x86.M512, k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)

M512MaskPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_mask_permutexvar_ps'. Requires AVX512F.

func M512MaskPowPd

func M512MaskPowPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskPowPd: Raise packed double-precision (64-bit) floating-point elements in 'a' to the power of the corresponding packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_pow_pd'. Requires AVX512F.

func M512MaskPowPs

func M512MaskPowPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskPowPs: Raise packed single-precision (32-bit) floating-point elements in 'a' to the power of the corresponding packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_pow_ps'. Requires AVX512F.

func M512MaskRcp14Pd

func M512MaskRcp14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_mask_rcp14_pd'. Requires AVX512F.

func M512MaskRcp14Ps

func M512MaskRcp14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_mask_rcp14_ps'. Requires AVX512F.

func M512MaskRecipPd

func M512MaskRecipPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (1 / a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_recip_pd'. Requires AVX512F.

func M512MaskRecipPs

func M512MaskRecipPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (1 / a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_recip_ps'. Requires AVX512F.

func M512MaskRemEpi32

func M512MaskRemEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rem_epi32'. Requires AVX512F.

func M512MaskRemEpu32

func M512MaskRemEpu32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rem_epu32'. Requires AVX512F.
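
For the signed variant above, Go's % operator computes the same truncated-division remainder that REMAINDER denotes. A sketch, assuming nonzero divisors (illustrative names; the unsigned form substitutes uint32):

	func maskRemEpi32(src, a, b [16]int32, k uint16) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] % b[j] // panics if b[j] == 0, as integer division would
			} else {
				dst[j] = src[j]
			}
		}
		return
	}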

func M512MaskRintPd

func M512MaskRintPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRintPd: Rounds the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundToNearestEven(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rint_pd'. Requires AVX512F.

func M512MaskRintPs

func M512MaskRintPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRintPs: Rounds the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundToNearestEven(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_rint_ps'. Requires AVX512F.

func M512MaskRolEpi32

func M512MaskRolEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
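
Go's math/bits package implements exactly this rotation, including the modulo-32 reduction of the count. A sketch (illustrative names):

	import "math/bits"

	func maskRolEpi32(src, a [16]uint32, k uint16, imm8 byte) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(imm8)) // count reduced mod 32
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

For the ror entries further below, rotate right by passing a negative count (bits.RotateLeft32(x, -n)); the rolv/rorv forms take the per-element count from 'b' instead of 'imm8'.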

func M512MaskRolEpi64

func M512MaskRolEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRolvEpi32

func M512MaskRolvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_mask_rolv_epi32'. Requires AVX512F.

func M512MaskRolvEpi64

func M512MaskRolvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_mask_rolv_epi64'. Requires AVX512F.

func M512MaskRorEpi32

func M512MaskRorEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRorEpi64

func M512MaskRorEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRorvEpi32

func M512MaskRorvEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_mask_rorv_epi32'. Requires AVX512F.

func M512MaskRorvEpi64

func M512MaskRorvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_mask_rorv_epi64'. Requires AVX512F.

func M512MaskRoundscalePd

func M512MaskRoundscalePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
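
Stripped of the mask loop and the precision-exception bookkeeping, one roundscale element is: scale up by 2^M, round to an integer, scale back down. A sketch for the round-to-nearest-even case (imm8[1:0] == 0 and imm8[2] == 0; names are illustrative):

	import "math"

	func roundscale(x float64, m uint8) float64 {
		s := math.Ldexp(1, int(m)) // 2^M, with M taken from imm8[7:4]
		return math.RoundToEven(x*s) / s
	}

For example, roundscale(1.2345, 2) rounds to 2 binary fraction bits and yields 1.25.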

func M512MaskRoundscalePs

func M512MaskRoundscalePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRoundscaleRoundPd

func M512MaskRoundscaleRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_mask_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRoundscaleRoundPs

func M512MaskRoundscaleRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_mask_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskRsqrt14Pd

func M512MaskRsqrt14Pd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_mask_rsqrt14_pd'. Requires AVX512F.

func M512MaskRsqrt14Ps

func M512MaskRsqrt14Ps(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_mask_rsqrt14_ps'. Requires AVX512F.

func M512MaskScalefPd

func M512MaskScalefPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_pd'. Requires AVX512F.
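
Ignoring the NaN and denormal special cases handled by the SCALE helper above, the core operation multiplies 'a' by 2 raised to the floor of 'b', which math.Ldexp expresses directly. A sketch (illustrative names):

	import "math"

	func scalef(a, b float64) float64 {
		return math.Ldexp(a, int(math.Floor(b))) // a * 2^floor(b)
	}

For example, scalef(3.0, 2.7) is 3 * 2^2 = 12.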

func M512MaskScalefPs

func M512MaskScalefPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_ps'. Requires AVX512F.

func M512MaskScalefRoundPd

func M512MaskScalefRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_mask_scalef_round_pd'. Requires AVX512F.

func M512MaskScalefRoundPs

func M512MaskScalefRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_mask_scalef_round_ps'. Requires AVX512F.

func M512MaskSet1Epi32

func M512MaskSet1Epi32(src x86.M512i, k x86.Mmask16, a int) (dst x86.M512i)

M512MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_mask_set1_epi32'. Requires AVX512F.

func M512MaskSet1Epi64

func M512MaskSet1Epi64(src x86.M512i, k x86.Mmask8, a int64) (dst x86.M512i)

M512MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_mask_set1_epi64'. Requires AVX512F.
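
A minimal pure-Go sketch of this masked broadcast (hypothetical names; each lane selects between the scalar 'a' and the corresponding 'src' element):

	package main

	import "fmt"

	// maskSet1 models _mm512_mask_set1_epi64: broadcast a to every lane
	// whose mask bit is set, and copy src elsewhere.
	func maskSet1(src []int64, k uint8, a int64) []int64 {
		dst := make([]int64, len(src))
		for j := range src {
			if (k>>uint(j))&1 == 1 {
				dst[j] = a
			} else {
				dst[j] = src[j]
			}
		}
		return dst
	}

	func main() {
		src := []int64{0, 1, 2, 3, 4, 5, 6, 7}
		fmt.Println(maskSet1(src, 0b10100101, 99)) // [99 1 99 3 4 99 6 99]
	}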

func M512MaskShuffleF32x4

func M512MaskShuffleF32x4(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_mask_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
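
Ignoring the writemask step, the SELECT4 lane shuffle can be sketched in pure Go by treating each 128-bit lane as a block of four floats (names hypothetical):

	package main

	import "fmt"

	type lane = [4]float32 // one 128-bit block of four floats

	// select4 models SELECT4: pick lane control&3 from src.
	func select4(src [4]lane, control byte) lane {
		return src[control&3]
	}

	// shuffleF32x4 models the unmasked core of _mm512_mask_shuffle_f32x4:
	// the two low result lanes come from a, the two high lanes from b.
	func shuffleF32x4(a, b [4]lane, imm8 byte) [4]lane {
		return [4]lane{
			select4(a, imm8),    // imm8[1:0]
			select4(a, imm8>>2), // imm8[3:2]
			select4(b, imm8>>4), // imm8[5:4]
			select4(b, imm8>>6), // imm8[7:6]
		}
	}

	func main() {
		var a, b [4]lane
		for i := range a {
			a[i] = lane{float32(10 * i), 0, 0, 0}
			b[i] = lane{float32(100 * i), 0, 0, 0}
		}
		// 0b00011011 selects a[3], a[2], b[1], b[0].
		fmt.Println(shuffleF32x4(a, b, 0b00011011))
	}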

func M512MaskShuffleF64x2

func M512MaskShuffleF64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_mask_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShuffleI32x4

func M512MaskShuffleI32x4(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_mask_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShuffleI64x2

func M512MaskShuffleI64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_mask_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShufflePd

func M512MaskShufflePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskShufflePs

func M512MaskShufflePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
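
Within each 128-bit lane the same SELECT4 pattern picks 32-bit elements rather than whole lanes. A sketch of a single lane in pure Go (masking and the remaining lanes omitted):

	package main

	import "fmt"

	// shufpsLane models one 128-bit lane of VSHUFPS: the two low results
	// are selected from a by imm8[1:0] and imm8[3:2], the two high
	// results from b by imm8[5:4] and imm8[7:6].
	func shufpsLane(a, b [4]float32, imm8 byte) [4]float32 {
		return [4]float32{
			a[imm8&3],
			a[(imm8>>2)&3],
			b[(imm8>>4)&3],
			b[(imm8>>6)&3],
		}
	}

	func main() {
		a := [4]float32{0, 1, 2, 3}
		b := [4]float32{10, 11, 12, 13}
		fmt.Println(shufpsLane(a, b, 0b01001110)) // [2 3 10 11]
	}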

func M512MaskSinPd

func M512MaskSinPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sin_pd'. Requires AVX512F.

func M512MaskSinPs

func M512MaskSinPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sin_ps'. Requires AVX512F.

func M512MaskSincosPd

func M512MaskSincosPd(cos_res *x86.M512d, sin_src x86.M512d, cos_src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSincosPd: Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIN(a[i+63:i])
		cos_res[i+63:i] := COS(a[i+63:i])
	ELSE
		dst[i+63:i] := sin_src[i+63:i]
		cos_res[i+63:i] := cos_src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sincos_pd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512MaskSincosPs

func M512MaskSincosPs(cos_res *x86.M512, sin_src x86.M512, cos_src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSincosPs: Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in 'a', and store the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'. Elements are written to their respective locations using writemask 'k' (elements are copied from 'sin_src' or 'cos_src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIN(a[i+31:i])
		cos_res[i+31:i] := COS(a[i+31:i])
	ELSE
		dst[i+31:i] := sin_src[i+31:i]
		cos_res[i+31:i] := cos_src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sincos_ps'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
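
A pure-Go sketch of the masked sincos semantics, using math.Sincos as the scalar kernel (double-precision shown; the single-precision variant is analogous, and the names are hypothetical):

	package main

	import (
		"fmt"
		"math"
	)

	// maskSincos models _mm512_mask_sincos_pd: active lanes get SIN/COS
	// of a; inactive lanes copy sinSrc and cosSrc respectively.
	func maskSincos(sinSrc, cosSrc, a []float64, k uint8) (sin, cos []float64) {
		sin = make([]float64, len(a))
		cos = make([]float64, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 1 {
				sin[j], cos[j] = math.Sincos(a[j])
			} else {
				sin[j], cos[j] = sinSrc[j], cosSrc[j]
			}
		}
		return sin, cos
	}

	func main() {
		a := []float64{0, math.Pi / 2, math.Pi, 0, 0, 0, 0, 0}
		zero := make([]float64, 8)
		sin, cos := maskSincos(zero, zero, a, 0b00000111)
		fmt.Println(sin[:3], cos[:3]) // approximately [0 1 0] [1 0 -1]
	}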

func M512MaskSindPd

func M512MaskSindPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SIND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sind_pd'. Requires AVX512F.

func M512MaskSindPs

func M512MaskSindPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SIND(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sind_ps'. Requires AVX512F.

func M512MaskSinhPd

func M512MaskSinhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SINH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sinh_pd'. Requires AVX512F.

func M512MaskSinhPs

func M512MaskSinhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SINH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_sinh_ps'. Requires AVX512F.

func M512MaskSllEpi32

func M512MaskSllEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_mask_sll_epi32'. Requires AVX512F.
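
Note that the whole 64-bit 'count' is consulted: any count above 31 zeroes the lane outright rather than being reduced modulo 32. A pure-Go sketch:

	package main

	import "fmt"

	// maskSllEpi32 models _mm512_mask_sll_epi32: shift every active lane
	// left by the same 64-bit count, producing 0 when count > 31.
	func maskSllEpi32(src, a []uint32, k uint16, count uint64) []uint32 {
		dst := make([]uint32, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 0 {
				dst[j] = src[j]
				continue
			}
			if count > 31 {
				dst[j] = 0 // counts above 31 zero the lane, no modulo
			} else {
				dst[j] = a[j] << count
			}
		}
		return dst
	}

	func main() {
		a := []uint32{1, 2, 3, 4}
		src := []uint32{9, 9, 9, 9}
		fmt.Println(maskSllEpi32(src, a, 0b0111, 4))  // [16 32 48 9]
		fmt.Println(maskSllEpi32(src, a, 0b0111, 40)) // [0 0 0 9]
	}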

func M512MaskSllEpi64

func M512MaskSllEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_sll_epi64'. Requires AVX512F.

func M512MaskSlliEpi64

func M512MaskSlliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskSllvEpi64

func M512MaskSllvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_mask_sllv_epi64'. Requires AVX512F.

func M512MaskSqrtPd

func M512MaskSqrtPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_pd'. Requires AVX512F.

func M512MaskSqrtPs

func M512MaskSqrtPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_ps'. Requires AVX512F.

func M512MaskSqrtRoundPd

func M512MaskSqrtRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SQRT(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_mask_sqrt_round_pd'. Requires AVX512F.

func M512MaskSqrtRoundPs

func M512MaskSqrtRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SQRT(a[i+31:i])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_mask_sqrt_round_ps'. Requires AVX512F.

func M512MaskSraEpi32

func M512MaskSraEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_mask_sra_epi32'. Requires AVX512F.

func M512MaskSraEpi64

func M512MaskSraEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_sra_epi64'. Requires AVX512F.

func M512MaskSraiEpi64

func M512MaskSraiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
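
Arithmetic right shifts replicate the sign bit, which is what Go's >> operator already does for signed integers; shift counts of 64 or more fill the lane with the sign bit. A sketch of the immediate variant (names hypothetical):

	package main

	import "fmt"

	// maskSraiEpi64 models _mm512_mask_srai_epi64. Shifts of 64 or more
	// yield all sign bits, so they are clamped to 63 here.
	func maskSraiEpi64(src, a []int64, k uint8, imm8 uint8) []int64 {
		dst := make([]int64, len(a))
		for j := range a {
			if (k>>uint(j))&1 == 0 {
				dst[j] = src[j]
				continue
			}
			n := uint(imm8)
			if n > 63 {
				n = 63 // a[j]>>63 is 0 for non-negative, -1 for negative
			}
			dst[j] = a[j] >> n
		}
		return dst
	}

	func main() {
		a := []int64{-16, 16, -1, 1}
		src := []int64{0, 0, 0, 0}
		fmt.Println(maskSraiEpi64(src, a, 0b1111, 2))   // [-4 4 -1 0]
		fmt.Println(maskSraiEpi64(src, a, 0b1111, 100)) // [-1 0 -1 0]
	}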

func M512MaskSravEpi64

func M512MaskSravEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_mask_srav_epi64'. Requires AVX512F.

func M512MaskSrlEpi32

func M512MaskSrlEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_mask_srl_epi32'. Requires AVX512F.

func M512MaskSrlEpi64

func M512MaskSrlEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srl_epi64'. Requires AVX512F.

func M512MaskSrliEpi64

func M512MaskSrliEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskSrlvEpi64

func M512MaskSrlvEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_mask_srlv_epi64'. Requires AVX512F.

func M512MaskSubEpi64

func M512MaskSubEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_mask_sub_epi64'. Requires AVX512F.

func M512MaskSvmlRoundPd

func M512MaskSvmlRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskSvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ROUND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_svml_round_pd'. Requires AVX512F.

func M512MaskTanPd

func M512MaskTanPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TAN(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tan_pd'. Requires AVX512F.

func M512MaskTanPs

func M512MaskTanPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TAN(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tan_ps'. Requires AVX512F.

func M512MaskTandPd

func M512MaskTandPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TAND(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tand_pd'. Requires AVX512F.

func M512MaskTandPs

func M512MaskTandPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TAND(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tand_ps'. Requires AVX512F.

func M512MaskTanhPd

func M512MaskTanhPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TANH(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tanh_pd'. Requires AVX512F.

func M512MaskTanhPs

func M512MaskTanhPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TANH(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_tanh_ps'. Requires AVX512F.

func M512MaskTernarylogicEpi32

func M512MaskTernarylogicEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskTernarylogicEpi64

func M512MaskTernarylogicEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' are used to form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
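
In other words, 'imm8' is an 8-entry truth table indexed by the bit triple (src, a, b). A pure-Go sketch on a single 64-bit lane (the vector forms repeat this per lane, under the writemask):

	package main

	import "fmt"

	// ternlog models one lane of VPTERNLOGQ: for every bit position, the
	// triple (src, a, b) forms a 3-bit index into the imm8 truth table.
	func ternlog(src, a, b uint64, imm8 uint8) uint64 {
		var dst uint64
		for h := uint(0); h < 64; h++ {
			idx := (src>>h&1)<<2 | (a>>h&1)<<1 | b>>h&1
			dst |= uint64(imm8>>idx&1) << h
		}
		return dst
	}

	func main() {
		// imm8 = 0x96 is the classic 3-input XOR table: its bit is set
		// for indices 1, 2, 4, 7 (an odd number of input ones).
		fmt.Printf("%x\n", ternlog(0xF0F0, 0xCCCC, 0xAAAA, 0x96)) // 9696
	}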

func M512MaskTestEpi64Mask

func M512MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_mask_test_epi64_mask'. Requires AVX512F.

func M512MaskTestnEpi32Mask

func M512MaskTestnEpi32Mask(k1 x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 15
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_mask_testn_epi32_mask'. Requires AVX512F.

func M512MaskTestnEpi64Mask

func M512MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 7
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_mask_testn_epi64_mask'. Requires AVX512F.
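
A pure-Go sketch of the test/testn pair on 64-bit lanes: both AND the lanes and are pre-filtered by 'k1'; test sets the mask bit when the AND is non-zero, testn when it is zero (names hypothetical):

	package main

	import "fmt"

	// maskTestEpi64Mask models VPTESTMQ: mask bit j is set when lane j
	// is selected by k1 and a&b is non-zero there.
	func maskTestEpi64Mask(k1 uint8, a, b []uint64) uint8 {
		var k uint8
		for j := range a {
			if (k1>>uint(j))&1 == 1 && a[j]&b[j] != 0 {
				k |= 1 << uint(j)
			}
		}
		return k
	}

	// maskTestnEpi64Mask models VPTESTNMQ: the same, but for a&b == 0.
	func maskTestnEpi64Mask(k1 uint8, a, b []uint64) uint8 {
		var k uint8
		for j := range a {
			if (k1>>uint(j))&1 == 1 && a[j]&b[j] == 0 {
				k |= 1 << uint(j)
			}
		}
		return k
	}

	func main() {
		a := []uint64{1, 2, 4, 8}
		b := []uint64{1, 1, 4, 4}
		fmt.Printf("%04b %04b\n",
			maskTestEpi64Mask(0b1111, a, b),  // lanes 0 and 2 overlap
			maskTestnEpi64Mask(0b1111, a, b)) // lanes 1 and 3 do not
	}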

func M512MaskTruncPd

func M512MaskTruncPd(src x86.M512d, k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskTruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := TRUNCATE(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_trunc_pd'. Requires AVX512F.

func M512MaskTruncPs

func M512MaskTruncPs(src x86.M512, k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskTruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := TRUNCATE(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mask_trunc_ps'. Requires AVX512F.

func M512MaskUnpackhiEpi32

func M512MaskUnpackhiEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_mask_unpackhi_epi32'. Requires AVX512F.

func M512MaskUnpackhiEpi64

func M512MaskUnpackhiEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_mask_unpackhi_epi64'. Requires AVX512F.
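
Ignoring the writemask, the 64-bit interleave can be sketched in pure Go; each 128-bit lane (a pair of elements) contributes its high element from 'a', then from 'b' (names hypothetical):

	package main

	import "fmt"

	// unpackhiEpi64 models the unmasked core of VPUNPCKHQDQ: within each
	// 128-bit lane, take the high element of a, then the high element of b.
	func unpackhiEpi64(a, b [8]int64) [8]int64 {
		var dst [8]int64
		for lane := 0; lane < 4; lane++ {
			dst[2*lane] = a[2*lane+1]   // high qword of a's lane
			dst[2*lane+1] = b[2*lane+1] // high qword of b's lane
		}
		return dst
	}

	func main() {
		a := [8]int64{0, 1, 2, 3, 4, 5, 6, 7}
		b := [8]int64{10, 11, 12, 13, 14, 15, 16, 17}
		fmt.Println(unpackhiEpi64(a, b)) // [1 11 3 13 5 15 7 17]
	}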

func M512MaskUnpackhiPd

func M512MaskUnpackhiPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_mask_unpackhi_pd'. Requires AVX512F.

func M512MaskUnpackhiPs

func M512MaskUnpackhiPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_mask_unpackhi_ps'. Requires AVX512F.

func M512MaskUnpackloEpi32

func M512MaskUnpackloEpi32(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_mask_unpacklo_epi32'. Requires AVX512F.

func M512MaskUnpackloEpi64

func M512MaskUnpackloEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_mask_unpacklo_epi64'. Requires AVX512F.

func M512MaskUnpackloPd

func M512MaskUnpackloPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_mask_unpacklo_pd'. Requires AVX512F.

func M512MaskUnpackloPs

func M512MaskUnpackloPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_mask_unpacklo_ps'. Requires AVX512F.

func M512MaskzAbsEpi32

func M512MaskzAbsEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm512_maskz_abs_epi32'. Requires AVX512F.

func M512MaskzAbsEpi64

func M512MaskzAbsEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm512_maskz_abs_epi64'. Requires AVX512F.

func M512MaskzAddEpi32

func M512MaskzAddEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm512_maskz_add_epi32'. Requires AVX512F.

func M512MaskzAddEpi64

func M512MaskzAddEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm512_maskz_add_epi64'. Requires AVX512F.

func M512MaskzAddPd

func M512MaskzAddPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAddPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_pd'. Requires AVX512F.

func M512MaskzAddPs

func M512MaskzAddPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_ps'. Requires AVX512F.

func M512MaskzAddRoundPd

func M512MaskzAddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzAddRoundPd: Add packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] + b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPD'. Intrinsic: '_mm512_maskz_add_round_pd'. Requires AVX512F.

func M512MaskzAddRoundPs

func M512MaskzAddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzAddRoundPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] + b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VADDPS'. Intrinsic: '_mm512_maskz_add_round_ps'. Requires AVX512F.

func M512MaskzAlignrEpi32

func M512MaskzAlignrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskzAlignrEpi32: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 32-bit elements, and store the low 64 bytes (16 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := temp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGND'. Intrinsic: '_mm512_maskz_alignr_epi32'. Requires AVX512F.

func M512MaskzAlignrEpi64

func M512MaskzAlignrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskzAlignrEpi64: Concatenate 'a' and 'b' into a 128-byte immediate result, shift the result right by 'count' 64-bit elements, and store the low 64 bytes (8 elements) in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := temp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VALIGNQ'. Intrinsic: '_mm512_maskz_alignr_epi64'. Requires AVX512F.
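
A pure-Go sketch of the unmasked valign core: 'b' supplies the low half of the 1024-bit intermediate, 'a' the high half, and the result is an 8-element window starting 'count' elements from the bottom (names hypothetical):

	package main

	import "fmt"

	// alignrEpi64 models the unmasked core of VALIGNQ: concatenate a
	// above b and return the 8 elements starting count elements from the
	// bottom. count is taken in the 0..8 range for this sketch.
	func alignrEpi64(a, b [8]int64, count int) [8]int64 {
		temp := append(b[:], a[:]...) // temp[0..7]=b, temp[8..15]=a
		var dst [8]int64
		copy(dst[:], temp[count:])
		return dst
	}

	func main() {
		a := [8]int64{10, 11, 12, 13, 14, 15, 16, 17}
		b := [8]int64{0, 1, 2, 3, 4, 5, 6, 7}
		fmt.Println(alignrEpi64(a, b, 3)) // [3 4 5 6 7 10 11 12]
	}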

func M512MaskzAndEpi32

func M512MaskzAndEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm512_maskz_and_epi32'. Requires AVX512F.

func M512MaskzAndEpi64

func M512MaskzAndEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm512_maskz_and_epi64'. Requires AVX512F.

func M512MaskzAndnotEpi32

func M512MaskzAndnotEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm512_maskz_andnot_epi32'. Requires AVX512F.

func M512MaskzAndnotEpi64

func M512MaskzAndnotEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm512_maskz_andnot_epi64'. Requires AVX512F.

func M512MaskzBroadcastF32x4

func M512MaskzBroadcastF32x4(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastF32x4: Broadcast the 4 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X4'. Intrinsic: '_mm512_maskz_broadcast_f32x4'. Requires AVX512F.

func M512MaskzBroadcastF64x4

func M512MaskzBroadcastF64x4(k x86.Mmask8, a x86.M256d) (dst x86.M512d)

M512MaskzBroadcastF64x4: Broadcast the 4 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X4'. Intrinsic: '_mm512_maskz_broadcast_f64x4'. Requires AVX512F.

func M512MaskzBroadcastI32x4

func M512MaskzBroadcastI32x4(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI32x4: Broadcast the 4 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 4)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X4'. Intrinsic: '_mm512_maskz_broadcast_i32x4'. Requires AVX512F.

func M512MaskzBroadcastI64x4

func M512MaskzBroadcastI64x4(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzBroadcastI64x4: Broadcast the 4 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 4)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X4'. Intrinsic: '_mm512_maskz_broadcast_i64x4'. Requires AVX512F.

func M512MaskzBroadcastdEpi32

func M512MaskzBroadcastdEpi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_broadcastd_epi32'. Requires AVX512F.

func M512MaskzBroadcastqEpi64

func M512MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_broadcastq_epi64'. Requires AVX512F.

func M512MaskzBroadcastsdPd

func M512MaskzBroadcastsdPd(k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskzBroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm512_maskz_broadcastsd_pd'. Requires AVX512F.

func M512MaskzBroadcastssPs

func M512MaskzBroadcastssPs(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm512_maskz_broadcastss_ps'. Requires AVX512F.

func M512MaskzCompressEpi32

func M512MaskzCompressEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm512_maskz_compress_epi32'. Requires AVX512F.

func M512MaskzCompressEpi64

func M512MaskzCompressEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm512_maskz_compress_epi64'. Requires AVX512F.
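
The compress family packs the selected elements densely at the bottom of the result. A pure-Go sketch of the zero-filling 64-bit variant (names hypothetical):

	package main

	import "fmt"

	// maskzCompress models VPCOMPRESSQ with a zeromask: active elements
	// are stored contiguously from position 0 and the tail is zeroed.
	func maskzCompress(k uint8, a []int64) []int64 {
		dst := make([]int64, len(a)) // zero-initialized tail
		m := 0
		for j := range a {
			if (k>>uint(j))&1 == 1 {
				dst[m] = a[j]
				m++
			}
		}
		return dst
	}

	func main() {
		a := []int64{10, 11, 12, 13, 14, 15, 16, 17}
		fmt.Println(maskzCompress(0b10100110, a)) // [11 12 15 17 0 0 0 0]
	}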

func M512MaskzCompressPd

func M512MaskzCompressPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm512_maskz_compress_pd'. Requires AVX512F.

func M512MaskzCompressPs

func M512MaskzCompressPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm512_maskz_compress_ps'. Requires AVX512F.

func M512MaskzCvtRoundepi32Ps

func M512MaskzCvtRoundepi32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskzCvtRoundepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepi32_ps'. Requires AVX512F.
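
For reference, the rounding-control names listed above combine by bitwise OR. The bit values below are the ones defined in Intel's <immintrin.h>; this demonstration package does not export them, so a caller would have to restate them:

	package main

	import "fmt"

	// Rounding-control bits as defined in Intel's <immintrin.h>.
	const (
		froundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		froundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		froundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		froundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		froundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
		froundNoExc        = 0x08 // _MM_FROUND_NO_EXC
	)

	func main() {
		// "truncate, and suppress exceptions" from the list above:
		fmt.Printf("%#x\n", froundToZero|froundNoExc) // 0xb
	}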

func M512MaskzCvtRoundepu32Ps

func M512MaskzCvtRoundepu32Ps(k x86.Mmask16, a x86.M512i, rounding int) (dst x86.M512)

M512MaskzCvtRoundepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_UnsignedInt32_To_FP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepu32_ps'. Requires AVX512F.

func M512MaskzCvtRoundpdEpi32

func M512MaskzCvtRoundpdEpi32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			l := 64*j
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epi32'. Requires AVX512F.

func M512MaskzCvtRoundpdEpu32

func M512MaskzCvtRoundpdEpu32(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 32*j
			l := 64*j
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epu32'. Requires AVX512F.

func M512MaskzCvtRoundpdPs

func M512MaskzCvtRoundpdPs(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M256)

M512MaskzCvtRoundpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*32
			l := j*64
			IF k[j]
				dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvt_roundpd_ps'. Requires AVX512F.

func M512MaskzCvtRoundphPs

func M512MaskzCvtRoundphPs(k x86.Mmask16, a x86.M256i, sae int) (dst x86.M512)

M512MaskzCvtRoundphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		m := j*16
		IF k[j]
			dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvt_roundph_ps'. Requires AVX512F.

func M512MaskzCvtRoundpsEpi32

func M512MaskzCvtRoundpsEpi32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epi32'. Requires AVX512F.

func M512MaskzCvtRoundpsEpu32

func M512MaskzCvtRoundpsEpu32(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epu32'. Requires AVX512F.

func M512MaskzCvtRoundpsPd

func M512MaskzCvtRoundpsPd(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512d)

M512MaskzCvtRoundpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 64*j
		l := 32*j
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvt_roundps_pd'. Requires AVX512F.

func M512MaskzCvtRoundpsPh

func M512MaskzCvtRoundpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvt_roundps_ph'. Requires AVX512F.

func M512MaskzCvtepi16Epi32

func M512MaskzCvtepi16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm512_maskz_cvtepi16_epi32'. Requires AVX512F.
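
The SignExtend step corresponds directly to Go's signed widening conversion; the ZeroExtend used by the VPMOVZX* variants further down widens through the unsigned type instead. A two-line contrast:

	package main

	import "fmt"

	func main() {
		w := int16(-2) // bit pattern 0xFFFE
		fmt.Println(int32(w))         // SignExtend: -2 (0xFFFFFFFE)
		fmt.Println(int32(uint16(w))) // ZeroExtend: 65534 (0x0000FFFE)
	}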

func M512MaskzCvtepi16Epi64

func M512MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm512_maskz_cvtepi16_epi64'. Requires AVX512F.

func M512MaskzCvtepi32Epi16

func M512MaskzCvtepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm512_maskz_cvtepi32_epi16'. Requires AVX512F.

func M512MaskzCvtepi32Epi64

func M512MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm512_maskz_cvtepi32_epi64'. Requires AVX512F.

func M512MaskzCvtepi32Epi8

func M512MaskzCvtepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm512_maskz_cvtepi32_epi8'. Requires AVX512F.

func M512MaskzCvtepi32Pd

func M512MaskzCvtepi32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm512_maskz_cvtepi32_pd'. Requires AVX512F.

func M512MaskzCvtepi32Ps

func M512MaskzCvtepi32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm512_maskz_cvtepi32_ps'. Requires AVX512F.

func M512MaskzCvtepi64Epi16

func M512MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm512_maskz_cvtepi64_epi16'. Requires AVX512F.

func M512MaskzCvtepi64Epi32

func M512MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm512_maskz_cvtepi64_epi32'. Requires AVX512F.

func M512MaskzCvtepi64Epi8

func M512MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm512_maskz_cvtepi64_epi8'. Requires AVX512F.

func M512MaskzCvtepi8Epi32

func M512MaskzCvtepi8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm512_maskz_cvtepi8_epi32'. Requires AVX512F.

func M512MaskzCvtepi8Epi64

func M512MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm512_maskz_cvtepi8_epi64'. Requires AVX512F.

func M512MaskzCvtepu16Epi32

func M512MaskzCvtepu16Epi32(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm512_maskz_cvtepu16_epi32'. Requires AVX512F.

func M512MaskzCvtepu16Epi64

func M512MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm512_maskz_cvtepu16_epi64'. Requires AVX512F.

func M512MaskzCvtepu32Epi64

func M512MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm512_maskz_cvtepu32_epi64'. Requires AVX512F.

func M512MaskzCvtepu32Pd

func M512MaskzCvtepu32Pd(k x86.Mmask8, a x86.M256i) (dst x86.M512d)

M512MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm512_maskz_cvtepu32_pd'. Requires AVX512F.

func M512MaskzCvtepu32Ps

func M512MaskzCvtepu32Ps(k x86.Mmask16, a x86.M512i) (dst x86.M512)

M512MaskzCvtepu32Ps: Convert packed unsigned 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_UnsignedInt32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUDQ2PS'. Intrinsic: '_mm512_maskz_cvtepu32_ps'. Requires AVX512F.

func M512MaskzCvtepu8Epi32

func M512MaskzCvtepu8Epi32(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm512_maskz_cvtepu8_epi32'. Requires AVX512F.

func M512MaskzCvtepu8Epi64

func M512MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm512_maskz_cvtepu8_epi64'. Requires AVX512F.

func M512MaskzCvtpdEpi32

func M512MaskzCvtpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm512_maskz_cvtpd_epi32'. Requires AVX512F.

func M512MaskzCvtpdEpu32

func M512MaskzCvtpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtpd_epu32'. Requires AVX512F.

func M512MaskzCvtpdPs

func M512MaskzCvtpdPs(k x86.Mmask8, a x86.M512d) (dst x86.M256)

M512MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm512_maskz_cvtpd_ps'. Requires AVX512F.

func M512MaskzCvtphPs

func M512MaskzCvtphPs(k x86.Mmask16, a x86.M256i) (dst x86.M512)

M512MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm512_maskz_cvtph_ps'. Requires AVX512F.
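
The Convert_FP16_To_FP32 step can be reproduced in plain Go by re-biasing the exponent (127-15 = 112) and widening the mantissa. A minimal sketch covering zeros, subnormals, and Inf/NaN (illustrative only, not part of the package):

	package main

	import (
		"fmt"
		"math"
	)

	// fp16ToFP32 converts one IEEE 754 half-precision bit pattern to float32.
	func fp16ToFP32(h uint16) float32 {
		sign := uint32(h>>15) << 31
		exp := int32(h>>10) & 0x1F
		mant := uint32(h) & 0x3FF
		switch {
		case exp == 0x1F: // Inf or NaN: exponent becomes all-ones
			return math.Float32frombits(sign | 0xFF<<23 | mant<<13)
		case exp != 0: // normal: re-bias the exponent
			return math.Float32frombits(sign | uint32(exp+112)<<23 | mant<<13)
		case mant == 0: // signed zero
			return math.Float32frombits(sign)
		default: // subnormal: renormalize into a float32 normal
			e := int32(1)
			for mant&0x400 == 0 {
				mant <<= 1
				e--
			}
			mant &= 0x3FF
			return math.Float32frombits(sign | uint32(e+112)<<23 | mant<<13)
		}
	}

	func main() {
		fmt.Println(fp16ToFP32(0x3C00)) // 1
		fmt.Println(fp16ToFP32(0xC000)) // -2
		fmt.Println(fp16ToFP32(0x0001)) // 5.9604645e-08 (smallest subnormal)
	}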

func M512MaskzCvtpsEpi32

func M512MaskzCvtpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm512_maskz_cvtps_epi32'. Requires AVX512F.

func M512MaskzCvtpsEpu32

func M512MaskzCvtpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtps_epu32'. Requires AVX512F.

func M512MaskzCvtpsPd

func M512MaskzCvtpsPd(k x86.Mmask8, a x86.M256) (dst x86.M512d)

M512MaskzCvtpsPd: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2PD'. Intrinsic: '_mm512_maskz_cvtps_pd'. Requires AVX512F.

func M512MaskzCvtpsPh

func M512MaskzCvtpsPh(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M256i)

M512MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm512_maskz_cvtps_ph'. Requires AVX512F.

func M512MaskzCvtsepi32Epi16

func M512MaskzCvtsepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm512_maskz_cvtsepi32_epi16'. Requires AVX512F.
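
The saturating narrowings here differ from the truncating VPMOV* forms earlier: out-of-range values clamp to the extremes of the target type rather than wrapping. A one-lane sketch of Saturate_Int32_To_Int16 in plain Go:

	package main

	import (
		"fmt"
		"math"
	)

	// saturateInt32ToInt16 clamps to the int16 range instead of truncating.
	func saturateInt32ToInt16(x int32) int16 {
		if x > math.MaxInt16 {
			return math.MaxInt16
		}
		if x < math.MinInt16 {
			return math.MinInt16
		}
		return int16(x)
	}

	func main() {
		v := int32(70000)
		fmt.Println(saturateInt32ToInt16(v)) // 32767
		fmt.Println(int16(v))                // 4464: truncation wraps instead
	}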

func M512MaskzCvtsepi32Epi8

func M512MaskzCvtsepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm512_maskz_cvtsepi32_epi8'. Requires AVX512F.

func M512MaskzCvtsepi64Epi16

func M512MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm512_maskz_cvtsepi64_epi16'. Requires AVX512F.

func M512MaskzCvtsepi64Epi32

func M512MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm512_maskz_cvtsepi64_epi32'. Requires AVX512F.

func M512MaskzCvtsepi64Epi8

func M512MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm512_maskz_cvtsepi64_epi8'. Requires AVX512F.

func M512MaskzCvttRoundpdEpi32

func M512MaskzCvttRoundpdEpi32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskzCvttRoundpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epi32'. Requires AVX512F.

func M512MaskzCvttRoundpdEpu32

func M512MaskzCvttRoundpdEpu32(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M256i)

M512MaskzCvttRoundpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := 32*j
		l := 64*j
		IF k[j]
			dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epu32'. Requires AVX512F.

func M512MaskzCvttRoundpsEpi32

func M512MaskzCvttRoundpsEpi32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epi32'. Requires AVX512F.

func M512MaskzCvttRoundpsEpu32

func M512MaskzCvttRoundpsEpu32(k x86.Mmask16, a x86.M512, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := 32*j
		IF k[j]
			dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epu32'. Requires AVX512F.

func M512MaskzCvttpdEpi32

func M512MaskzCvttpdEpi32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm512_maskz_cvttpd_epi32'. Requires AVX512F.

func M512MaskzCvttpdEpu32

func M512MaskzCvttpdEpu32(k x86.Mmask8, a x86.M512d) (dst x86.M256i)

M512MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm512_maskz_cvttpd_epu32'. Requires AVX512F.

func M512MaskzCvttpsEpi32

func M512MaskzCvttpsEpi32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm512_maskz_cvttps_epi32'. Requires AVX512F.
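
Go's own float-to-integer conversion already truncates toward zero, matching Convert_FP32_To_Int32_Truncate for in-range inputs. Out-of-range inputs differ: the instruction returns the integer-indefinite value 0x80000000, while Go's result is implementation-defined, so portable code should range-check first:

	package main

	import "fmt"

	func main() {
		x, y := 2.9, -2.9
		fmt.Println(int32(x), int32(y)) // 2 -2: truncation toward zero
	}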

func M512MaskzCvttpsEpu32

func M512MaskzCvttpsEpu32(k x86.Mmask16, a x86.M512) (dst x86.M512i)

M512MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm512_maskz_cvttps_epu32'. Requires AVX512F.

func M512MaskzCvtusepi32Epi16

func M512MaskzCvtusepi32Epi16(k x86.Mmask16, a x86.M512i) (dst x86.M256i)

M512MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm512_maskz_cvtusepi32_epi16'. Requires AVX512F.

func M512MaskzCvtusepi32Epi8

func M512MaskzCvtusepi32Epi8(k x86.Mmask16, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm512_maskz_cvtusepi32_epi8'. Requires AVX512F.
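
Unsigned saturation clamps at the top of the unsigned target range. A one-lane sketch of Saturate_UnsignedInt32_To_Int8, with wrapping truncation shown for contrast:

	package main

	import "fmt"

	// saturateUint32ToUint8 clamps values above 255 to 255.
	func saturateUint32ToUint8(x uint32) uint8 {
		if x > 0xFF {
			return 0xFF
		}
		return uint8(x)
	}

	func main() {
		x := uint32(300)
		fmt.Println(saturateUint32ToUint8(x)) // 255
		fmt.Println(uint8(x))                 // 44: plain truncation wraps
	}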

func M512MaskzCvtusepi64Epi16

func M512MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm512_maskz_cvtusepi64_epi16'. Requires AVX512F.

func M512MaskzCvtusepi64Epi32

func M512MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M512i) (dst x86.M256i)

M512MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm512_maskz_cvtusepi64_epi32'. Requires AVX512F.

func M512MaskzCvtusepi64Epi8

func M512MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M512i) (dst x86.M128i)

M512MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm512_maskz_cvtusepi64_epi8'. Requires AVX512F.

func M512MaskzDivPd

func M512MaskzDivPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_pd'. Requires AVX512F.

func M512MaskzDivPs

func M512MaskzDivPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_ps'. Requires AVX512F.

func M512MaskzDivRoundPd

func M512MaskzDivRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzDivRoundPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := 64*j
			IF k[j]
				dst[i+63:i] := a[i+63:i] / b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm512_maskz_div_round_pd'. Requires AVX512F.

func M512MaskzDivRoundPs

func M512MaskzDivRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzDivRoundPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := 32*j
			IF k[j]
				dst[i+31:i] := a[i+31:i] / b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm512_maskz_div_round_ps'. Requires AVX512F.

func M512MaskzExpandEpi32

func M512MaskzExpandEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm512_maskz_expand_epi32'. Requires AVX512F.
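
Expand is the inverse of the compress operation shown earlier: consecutive source elements scatter out to the positions selected by 'k'. A pure-Go sketch (illustrative names):

	package main

	import "fmt"

	// maskzExpand32 places consecutive elements of 'a' at the positions
	// whose mask bit is set; unselected positions remain zero.
	func maskzExpand32(k uint16, a [16]int32) [16]int32 {
		var dst [16]int32
		m := 0
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return dst
	}

	func main() {
		a := [16]int32{1, 2, 3, 4}
		fmt.Println(maskzExpand32(0x0303, a)) // [1 2 0 0 0 0 0 0 3 4 0 ...]
	}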

func M512MaskzExpandEpi64

func M512MaskzExpandEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm512_maskz_expand_epi64'. Requires AVX512F.

func M512MaskzExpandPd

func M512MaskzExpandPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm512_maskz_expand_pd'. Requires AVX512F.

func M512MaskzExpandPs

func M512MaskzExpandPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm512_maskz_expand_ps'. Requires AVX512F.

func M512MaskzExtractf32x4Ps

func M512MaskzExtractf32x4Ps(k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M128)

M512MaskzExtractf32x4Ps: Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF32X4'. Intrinsic: '_mm512_maskz_extractf32x4_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
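
Lane extraction amounts to copying one aligned 128-bit chunk of the source, selected by the low bits of 'imm8'. A sketch on a 16-element array, with the per-element masking omitted:

	package main

	import "fmt"

	// extract32x4 selects one 128-bit lane (four float32s) by index,
	// as the imm8 CASE above does.
	func extract32x4(a [16]float32, imm8 byte) [4]float32 {
		var dst [4]float32
		copy(dst[:], a[4*(imm8&3):]) // imm8 & 3 selects lane 0..3
		return dst
	}

	func main() {
		var a [16]float32
		for i := range a {
			a[i] = float32(i)
		}
		fmt.Println(extract32x4(a, 2)) // [8 9 10 11]
	}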

func M512MaskzExtractf64x4Pd

func M512MaskzExtractf64x4Pd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M256d)

M512MaskzExtractf64x4Pd: Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF64X4'. Intrinsic: '_mm512_maskz_extractf64x4_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti32x4Epi32

func M512MaskzExtracti32x4Epi32(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskzExtracti32x4Epi32: Extract 128 bits (composed of 4 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI32X4'. Intrinsic: '_mm512_maskz_extracti32x4_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti64x4Epi64

func M512MaskzExtracti64x4Epi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskzExtracti64x4Epi64: Extract 256 bits (composed of 4 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI64X4'. Intrinsic: '_mm512_maskz_extracti64x4_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmPd

func M512MaskzFixupimmPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte) (dst x86.M512d)

M512MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
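
The heart of the fixup is the token_response[3:0] := src3[3+4*j:4*j] step: the classified input selects one 4-bit action code from the per-element control word in 'c'. A sketch of that nibble lookup, with a hypothetical control word:

	package main

	import "fmt"

	// tokenResponse extracts the 4-bit action code for input class j (0..7)
	// from the control word, i.e. src3[3+4*j : 4*j] in the pseudocode.
	func tokenResponse(ctrl uint64, j uint) uint8 {
		return uint8(ctrl>>(4*j)) & 0xF
	}

	func main() {
		// Hypothetical control word: class 2 (ZERO_VALUE_TOKEN) -> code 8
		// (+0), class 4 (NEG_INF_TOKEN) -> code 14 (MAX_FLOAT).
		ctrl := uint64(0x8)<<(4*2) | uint64(0xE)<<(4*4)
		fmt.Println(tokenResponse(ctrl, 2), tokenResponse(ctrl, 4)) // 8 14
	}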

func M512MaskzFixupimmPs

func M512MaskzFixupimmPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte) (dst x86.M512)

M512MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmRoundPd

func M512MaskzFixupimmRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512i, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzFixupimmRoundPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm512_maskz_fixupimm_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFixupimmRoundPs

func M512MaskzFixupimmRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512i, imm8 byte, rounding int) (dst x86.M512)

M512MaskzFixupimmRoundPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm512_maskz_fixupimm_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzFmaddPd

func M512MaskzFmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_pd'. Requires AVX512F.
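
The fused multiply-add computes a*b + c with a single rounding per element, which Go exposes directly as math.FMA. The residual of an inexact product makes the single rounding visible:

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		x := 1.0 / 3.0
		fmt.Println(x*3 - 1)            // 0: x*3 rounds to exactly 1 first
		fmt.Println(math.FMA(x, 3, -1)) // -5.551115123125783e-17: true residual
	}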

func M512MaskzFmaddPs

func M512MaskzFmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_ps'. Requires AVX512F.

func M512MaskzFmaddRoundPd

func M512MaskzFmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm512_maskz_fmadd_round_pd'. Requires AVX512F.

func M512MaskzFmaddRoundPs

func M512MaskzFmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm512_maskz_fmadd_round_ps'. Requires AVX512F.

func M512MaskzFmaddsubPd

func M512MaskzFmaddsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_pd'. Requires AVX512F.
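
The only twist over a plain FMA is the per-lane alternation, which a scalar sketch makes explicit (even lanes subtract 'c', odd lanes add it). Names and types below are illustrative, not part of this package.

	package sketch

	import "math"

	// maskzFmaddsubPd mirrors the pseudocode above: even j computes
	// a*b - c, odd j computes a*b + c, and masked-off lanes stay zero.
	func maskzFmaddsubPd(k uint8, a, b, c [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			if j%2 == 0 {
				dst[j] = math.FMA(a[j], b[j], -c[j])
			} else {
				dst[j] = math.FMA(a[j], b[j], c[j])
			}
		}
		return
	}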

func M512MaskzFmaddsubPs

func M512MaskzFmaddsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_ps'. Requires AVX512F.

func M512MaskzFmaddsubRoundPd

func M512MaskzFmaddsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmaddsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm512_maskz_fmaddsub_round_pd'. Requires AVX512F.

func M512MaskzFmaddsubRoundPs

func M512MaskzFmaddsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmaddsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm512_maskz_fmaddsub_round_ps'. Requires AVX512F.

func M512MaskzFmsubPd

func M512MaskzFmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_pd'. Requires AVX512F.

func M512MaskzFmsubPs

func M512MaskzFmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_ps'. Requires AVX512F.

func M512MaskzFmsubRoundPd

func M512MaskzFmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm512_maskz_fmsub_round_pd'. Requires AVX512F.

func M512MaskzFmsubRoundPs

func M512MaskzFmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm512_maskz_fmsub_round_ps'. Requires AVX512F.

func M512MaskzFmsubaddPd

func M512MaskzFmsubaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_pd'. Requires AVX512F.

func M512MaskzFmsubaddPs

func M512MaskzFmsubaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_ps'. Requires AVX512F.

func M512MaskzFmsubaddRoundPd

func M512MaskzFmsubaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFmsubaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				IF (j is even)
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
				ELSE
					dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
				FI
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm512_maskz_fmsubadd_round_pd'. Requires AVX512F.

func M512MaskzFmsubaddRoundPs

func M512MaskzFmsubaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFmsubaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				IF (j is even)
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
				ELSE
					dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
				FI
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm512_maskz_fmsubadd_round_ps'. Requires AVX512F.

func M512MaskzFnmaddPd

func M512MaskzFnmaddPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_pd'. Requires AVX512F.

func M512MaskzFnmaddPs

func M512MaskzFnmaddPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_ps'. Requires AVX512F.

func M512MaskzFnmaddRoundPd

func M512MaskzFnmaddRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFnmaddRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm512_maskz_fnmadd_round_pd'. Requires AVX512F.

func M512MaskzFnmaddRoundPs

func M512MaskzFnmaddRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFnmaddRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm512_maskz_fnmadd_round_ps'. Requires AVX512F.

func M512MaskzFnmsubPd

func M512MaskzFnmsubPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d) (dst x86.M512d)

M512MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_pd'. Requires AVX512F.

func M512MaskzFnmsubPs

func M512MaskzFnmsubPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512) (dst x86.M512)

M512MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_ps'. Requires AVX512F.

func M512MaskzFnmsubRoundPd

func M512MaskzFnmsubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, c x86.M512d, rounding int) (dst x86.M512d)

M512MaskzFnmsubRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm512_maskz_fnmsub_round_pd'. Requires AVX512F.

func M512MaskzFnmsubRoundPs

func M512MaskzFnmsubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, c x86.M512, rounding int) (dst x86.M512)

M512MaskzFnmsubRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm512_maskz_fnmsub_round_ps'. Requires AVX512F.

func M512MaskzGetexpPd

func M512MaskzGetexpPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_pd'. Requires AVX512F.
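
For a single lane, Go's standard library already computes the same quantity: math.Logb returns the binary exponent floor(log2(|x|)) as a float64. The sketch below is illustrative only and ignores the denormal and NaN edge cases that VGETEXPPD defines precisely.

	package sketch

	import "math"

	// getexpLane models one element of VGETEXPPD.
	func getexpLane(x float64) float64 {
		return math.Logb(x) // floor(log2(|x|)); -Inf for x == 0
	}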

func M512MaskzGetexpPs

func M512MaskzGetexpPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_ps'. Requires AVX512F.

func M512MaskzGetexpRoundPd

func M512MaskzGetexpRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskzGetexpRoundPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertExpFP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm512_maskz_getexp_round_pd'. Requires AVX512F.

func M512MaskzGetexpRoundPs

func M512MaskzGetexpRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskzGetexpRoundPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ConvertExpFP32(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm512_maskz_getexp_round_ps'. Requires AVX512F.

func M512MaskzGetmantPd

func M512MaskzGetmantPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512d)

M512MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_pd'. Requires AVX512F.
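
For the common _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src case, math.Frexp gives a quick scalar model: it returns frac with |frac| in [0.5, 1), so doubling it lands the mantissa in [1, 2) with the source sign. A sketch under that assumption only; the other intervals, sign modes, and special inputs are not handled.

	package sketch

	import "math"

	// getmant12 models one lane of VGETMANTPD with interv =
	// _MM_MANT_NORM_1_2 and sc = _MM_MANT_SIGN_src.
	func getmant12(x float64) float64 {
		frac, _ := math.Frexp(x) // x = frac * 2^exp, |frac| in [0.5, 1)
		return frac * 2          // |result| in [1, 2), sign of x kept
	}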

func M512MaskzGetmantPs

func M512MaskzGetmantPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M512)

M512MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_ps'. Requires AVX512F.

func M512MaskzGetmantRoundPd

func M512MaskzGetmantRoundPd(k x86.Mmask8, a x86.M512d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512d)

M512MaskzGetmantRoundPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm512_maskz_getmant_round_pd'. Requires AVX512F.

func M512MaskzGetmantRoundPs

func M512MaskzGetmantRoundPs(k x86.Mmask16, a x86.M512, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M512)

M512MaskzGetmantRoundPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc', which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm512_maskz_getmant_round_ps'. Requires AVX512F.

func M512MaskzInsertf32x4

func M512MaskzInsertf32x4(k x86.Mmask16, a x86.M512, b x86.M128, imm8 byte) (dst x86.M512)

M512MaskzInsertf32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X4'. Intrinsic: '_mm512_maskz_insertf32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
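
Ignoring the mask step, the lane insertion itself is just a 128-bit (four-float) copy at the offset named by imm8[1:0], as this hypothetical sketch on plain arrays shows.

	package sketch

	// insertF32x4 copies a, then overwrites the 128-bit lane selected by
	// imm8[1:0] with b — the 'tmp' computation in the pseudocode above.
	func insertF32x4(a [16]float32, b [4]float32, imm8 byte) [16]float32 {
		tmp := a
		lane := int(imm8 & 3)
		copy(tmp[lane*4:lane*4+4], b[:])
		return tmp
	}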

func M512MaskzInsertf64x4

func M512MaskzInsertf64x4(k x86.Mmask8, a x86.M512d, b x86.M256d, imm8 byte) (dst x86.M512d)

M512MaskzInsertf64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X4'. Intrinsic: '_mm512_maskz_insertf64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti32x4

func M512MaskzInserti32x4(k x86.Mmask16, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskzInserti32x4: Copy 'a' to 'tmp', then insert 128 bits (composed of 4 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X4'. Intrinsic: '_mm512_maskz_inserti32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti64x4

func M512MaskzInserti64x4(k x86.Mmask8, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskzInserti64x4: Copy 'a' to 'tmp', then insert 256 bits (composed of 4 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X4'. Intrinsic: '_mm512_maskz_inserti64x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzMaxEpi32

func M512MaskzMaxEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm512_maskz_max_epi32'. Requires AVX512F.
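
A scalar model of the masked maximum, with plain arrays standing in for x86.M512i (illustrative only):

	package sketch

	// maskzMaxEpi32: per-lane signed maximum where mask bit j is set;
	// masked-off lanes stay zero.
	func maskzMaxEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<j) != 0 {
				if a[j] > b[j] {
					dst[j] = a[j]
				} else {
					dst[j] = b[j]
				}
			}
		}
		return
	}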

func M512MaskzMaxEpi64

func M512MaskzMaxEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_maskz_max_epi64'. Requires AVX512F.

func M512MaskzMaxEpu32

func M512MaskzMaxEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm512_maskz_max_epu32'. Requires AVX512F.

func M512MaskzMaxEpu64

func M512MaskzMaxEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_maskz_max_epu64'. Requires AVX512F.

func M512MaskzMaxPd

func M512MaskzMaxPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_pd'. Requires AVX512F.

func M512MaskzMaxPs

func M512MaskzMaxPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_ps'. Requires AVX512F.

func M512MaskzMaxRoundPd

func M512MaskzMaxRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskzMaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_maskz_max_round_pd'. Requires AVX512F.

func M512MaskzMaxRoundPs

func M512MaskzMaxRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskzMaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_maskz_max_round_ps'. Requires AVX512F.

func M512MaskzMinEpi32

func M512MaskzMinEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm512_maskz_min_epi32'. Requires AVX512F.

func M512MaskzMinEpi64

func M512MaskzMinEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_maskz_min_epi64'. Requires AVX512F.

func M512MaskzMinEpu32

func M512MaskzMinEpu32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm512_maskz_min_epu32'. Requires AVX512F.

func M512MaskzMinEpu64

func M512MaskzMinEpu64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_maskz_min_epu64'. Requires AVX512F.

func M512MaskzMinPd

func M512MaskzMinPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_pd'. Requires AVX512F.

func M512MaskzMinPs

func M512MaskzMinPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_ps'. Requires AVX512F.

func M512MaskzMinRoundPd

func M512MaskzMinRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaskzMinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
		ELSE
			dst[i+63:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_maskz_min_round_pd'. Requires AVX512F.

func M512MaskzMinRoundPs

func M512MaskzMinRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaskzMinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		IF k[j]
			dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
		ELSE
			dst[i+31:i] := 0
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_maskz_min_round_ps'. Requires AVX512F.

func M512MaskzMovEpi32

func M512MaskzMovEpi32(k x86.Mmask16, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm512_maskz_mov_epi32'. Requires AVX512F.

func M512MaskzMovEpi64

func M512MaskzMovEpi64(k x86.Mmask8, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm512_maskz_mov_epi64'. Requires AVX512F.

func M512MaskzMovPd

func M512MaskzMovPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm512_maskz_mov_pd'. Requires AVX512F.

func M512MaskzMovPs

func M512MaskzMovPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm512_maskz_mov_ps'. Requires AVX512F.

func M512MaskzMovedupPd

func M512MaskzMovedupPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_maskz_movedup_pd'. Requires AVX512F.
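
The duplication pattern is easier to see on plain arrays: each even-indexed element is copied into the odd slot above it before the zeromask is applied. A hypothetical sketch:

	package sketch

	// maskzMovedupPd mirrors the 'tmp' table above: tmp[2p] = tmp[2p+1]
	// = a[2p], then the zeromask selects or zeroes each lane.
	func maskzMovedupPd(k uint8, a [8]float64) (dst [8]float64) {
		var tmp [8]float64
		for j := 0; j < 8; j += 2 {
			tmp[j], tmp[j+1] = a[j], a[j]
		}
		for j := 0; j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}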

func M512MaskzMovehdupPs

func M512MaskzMovehdupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_maskz_movehdup_ps'. Requires AVX512F.

func M512MaskzMoveldupPs

func M512MaskzMoveldupPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_maskz_moveldup_ps'. Requires AVX512F.

func M512MaskzMulEpi32

func M512MaskzMulEpi32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_maskz_mul_epi32'. Requires AVX512F.
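
The point of VPMULDQ is the widening: only the low signed 32 bits of each 64-bit lane are multiplied, into a full 64-bit product. In scalar Go that is a truncate-then-sign-extend, as this illustrative sketch shows.

	package sketch

	// maskzMulEpi32: int32(v) keeps the low 32 bits of each lane and the
	// int64 conversion sign-extends them, so the product never overflows.
	func maskzMulEpi32(k uint8, a, b [8]int64) (dst [8]int64) {
		for j := 0; j < 8; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
			}
		}
		return
	}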

func M512MaskzMulEpu32

func M512MaskzMulEpu32(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_maskz_mul_epu32'. Requires AVX512F.

func M512MaskzMulPd

func M512MaskzMulPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_pd'. Requires AVX512F.

func M512MaskzMulPs

func M512MaskzMulPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_ps'. Requires AVX512F.

func M512MaskzMulRoundPd

func M512MaskzMulRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzMulRoundPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] * b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm512_maskz_mul_round_pd'. Requires AVX512F.

func M512MaskzMulRoundPs

func M512MaskzMulRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzMulRoundPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] * b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm512_maskz_mul_round_ps'. Requires AVX512F.

func M512MaskzMulloEpi32

func M512MaskzMulloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm512_maskz_mullo_epi32'. Requires AVX512F.
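
Keeping only the low 32 bits of the product is exactly what Go's int32 multiplication does, since integer overflow wraps. A hypothetical scalar model:

	package sketch

	// maskzMulloEpi32: a[j] * b[j] wraps modulo 2^32, i.e. it keeps the
	// low 32 bits of the 64-bit intermediate, matching VPMULLD.
	func maskzMulloEpi32(k uint16, a, b [16]int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] * b[j]
			}
		}
		return
	}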

func M512MaskzOrEpi32

func M512MaskzOrEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORD'. Intrinsic: '_mm512_maskz_or_epi32'. Requires AVX512F.

func M512MaskzOrEpi64

func M512MaskzOrEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm512_maskz_or_epi64'. Requires AVX512F.

func M512MaskzPermutePd

func M512MaskzPermutePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
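
The sixteen IF lines above reduce to one control bit per destination element, selecting the low or high double of its own 128-bit lane. A plain-Go sketch (maskzPermutePd is an illustrative name, not a package function):

	func maskzPermutePd(k uint8, a [8]float64, imm8 byte) (dst [8]float64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 2) * 2           // base index of this 128-bit lane
			sel := int(imm8>>uint(j)) & 1 // control bit j picks low or high
			dst[j] = a[lane+sel]
		}
		return
	}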

func M512MaskzPermutePs

func M512MaskzPermutePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutevarPd

func M512MaskzPermutevarPd(k x86.Mmask8, a x86.M512d, b x86.M512i) (dst x86.M512d)

M512MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_maskz_permutevar_pd'. Requires AVX512F.

func M512MaskzPermutevarPs

func M512MaskzPermutevarPs(k x86.Mmask16, a x86.M512, b x86.M512i) (dst x86.M512)

M512MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_maskz_permutevar_ps'. Requires AVX512F.
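
Only bits [1:0] of each control element matter, and every selection stays within the element's own 128-bit lane. A plain-Go sketch of the pseudocode (maskzPermutevarPs is an illustrative name, not a package function):

	func maskzPermutevarPs(k uint16, a [16]float32, b [16]uint32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 4) * 4  // base index of this 128-bit lane
			sel := int(b[j] & 3) // SELECT4 control
			dst[j] = a[lane+sel]
		}
		return
	}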

func M512MaskzPermutex2varEpi32

func M512MaskzPermutex2varEpi32(k x86.Mmask16, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_maskz_permutex2var_epi32'. Requires AVX512F.
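
Each idx element is a 5-bit selector over the 32 elements of the concatenated pair: bits [3:0] pick the element and bit 4 picks between 'a' and 'b'. A plain-Go sketch (maskzPermutex2varEpi32 is an illustrative name, not a package function):

	func maskzPermutex2varEpi32(k uint16, a, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			off := idx[j] & 0xF   // element index within the chosen source
			if idx[j]&0x10 != 0 { // idx bit 4 selects the source table
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}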

func M512MaskzPermutex2varEpi64

func M512MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_maskz_permutex2var_epi64'. Requires AVX512F.

func M512MaskzPermutex2varPd

func M512MaskzPermutex2varPd(k x86.Mmask8, a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_maskz_permutex2var_pd'. Requires AVX512F.

func M512MaskzPermutex2varPs

func M512MaskzPermutex2varPs(k x86.Mmask16, a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)

M512MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_maskz_permutex2var_ps'. Requires AVX512F.

func M512MaskzPermutexEpi64

func M512MaskzPermutexEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzPermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutexPd

func M512MaskzPermutexPd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzPermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzPermutexvarEpi32

func M512MaskzPermutexvarEpi32(k x86.Mmask16, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskzPermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_maskz_permutexvar_epi32'. Requires AVX512F.

func M512MaskzPermutexvarEpi64

func M512MaskzPermutexvarEpi64(k x86.Mmask8, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskzPermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_maskz_permutexvar_epi64'. Requires AVX512F.

func M512MaskzPermutexvarPd

func M512MaskzPermutexvarPd(k x86.Mmask8, idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512MaskzPermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	IF k[j]
		dst[i+63:i] := a[id+63:id]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_maskz_permutexvar_pd'. Requires AVX512F.

func M512MaskzPermutexvarPs

func M512MaskzPermutexvarPs(k x86.Mmask16, idx x86.M512i, a x86.M512) (dst x86.M512)

M512MaskzPermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	IF k[j]
		dst[i+31:i] := a[id+31:id]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_maskz_permutexvar_ps'. Requires AVX512F.

func M512MaskzRcp14Pd

func M512MaskzRcp14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_maskz_rcp14_pd'. Requires AVX512F.

func M512MaskzRcp14Ps

func M512MaskzRcp14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_maskz_rcp14_ps'. Requires AVX512F.
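
The hardware result is only guaranteed to agree with the true reciprocal to a relative error below 2^-14, so exact division can stand in for APPROXIMATE in a reference model (an assumption of this sketch, not bit-exact hardware behavior):

	func maskzRcp14Ps(k uint16, a [16]float32) (dst [16]float32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = 1.0 / a[j] // exact stand-in for the 2^-14 approximation
			}
		}
		return
	}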

func M512MaskzRolEpi32

func M512MaskzRolEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
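
LEFT_ROTATE_DWORDS is exactly math/bits.RotateLeft32 with the count reduced modulo 32. A plain-Go sketch (maskzRolEpi32 is an illustrative name, not a package function):

	import "math/bits"

	func maskzRolEpi32(k uint16, a [16]uint32, imm8 byte) (dst [16]uint32) {
		n := int(imm8) % 32 // count := count_src modulo 32
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], n)
			}
		}
		return
	}

For example, rotating 0x80000001 left by 4 gives 0x00000018: both set bits move up four positions, with bit 31 wrapping around to bit 3.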

func M512MaskzRolEpi64

func M512MaskzRolEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRolvEpi32

func M512MaskzRolvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_maskz_rolv_epi32'. Requires AVX512F.

func M512MaskzRolvEpi64

func M512MaskzRolvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_maskz_rolv_epi64'. Requires AVX512F.

func M512MaskzRorEpi32

func M512MaskzRorEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRorEpi64

func M512MaskzRorEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRorvEpi32

func M512MaskzRorvEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_maskz_rorv_epi32'. Requires AVX512F.

func M512MaskzRorvEpi64

func M512MaskzRorvEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_maskz_rorv_epi64'. Requires AVX512F.

func M512MaskzRoundscalePd

func M512MaskzRoundscalePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
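
With imm8[2] = 0 and rounding direction 0 (round to nearest even), a single lane reduces to round(2^M * x) / 2^M. A per-lane sketch under those assumptions (roundscalePdLane is an illustrative name, not a package function):

	import "math"

	func roundscalePdLane(x float64, m uint) float64 {
		scale := math.Ldexp(1, int(m))           // 2^M
		return math.RoundToEven(x*scale) / scale // round, then scale back down
	}

For example, roundscalePdLane(1.2345, 4) rounds to the nearest multiple of 2^-4 and returns 1.25.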

func M512MaskzRoundscalePs

func M512MaskzRoundscalePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRoundscaleRoundPd

func M512MaskzRoundscaleRoundPd(k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzRoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_maskz_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRoundscaleRoundPs

func M512MaskzRoundscaleRoundPs(k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzRoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_maskz_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzRsqrt14Pd

func M512MaskzRsqrt14Pd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_maskz_rsqrt14_pd'. Requires AVX512F.

func M512MaskzRsqrt14Ps

func M512MaskzRsqrt14Ps(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_maskz_rsqrt14_ps'. Requires AVX512F.

func M512MaskzScalefPd

func M512MaskzScalefPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_pd'. Requires AVX512F.
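
On the ordinary path (no NaNs, no suppressed denormals) SCALE reduces to src1 * 2^FLOOR(src2). A per-lane sketch that deliberately skips the special cases above (scalefPdLane is an illustrative name, not a package function):

	import "math"

	func scalefPdLane(a, b float64) float64 {
		// math.Ldexp multiplies by 2^exp exactly, matching POW(2, FLOOR(b)).
		return math.Ldexp(a, int(math.Floor(b)))
	}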

func M512MaskzScalefPs

func M512MaskzScalefPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_ps'. Requires AVX512F.

func M512MaskzScalefRoundPd

func M512MaskzScalefRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_maskz_scalef_round_pd'. Requires AVX512F.

func M512MaskzScalefRoundPs

func M512MaskzScalefRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_maskz_scalef_round_ps'. Requires AVX512F.

func M512MaskzSet1Epi32

func M512MaskzSet1Epi32(k x86.Mmask16, a int) (dst x86.M512i)

M512MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_maskz_set1_epi32'. Requires AVX512F.
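
Zeromask broadcasting in plain Go is just a guarded fill (maskzSet1Epi32 is an illustrative name, not a package function):

	func maskzSet1Epi32(k uint16, a int32) (dst [16]int32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a // broadcast into selected lanes only
			}
		}
		return
	}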

func M512MaskzSet1Epi64

func M512MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M512i)

M512MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_maskz_set1_epi64'. Requires AVX512F.

func M512MaskzShuffleEpi32

func M512MaskzShuffleEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm512_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
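
The same eight imm8 bits drive every 128-bit lane, so one lane's shuffle pattern repeats four times. A plain-Go sketch (maskzShuffleEpi32 is an illustrative name, not a package function):

	func maskzShuffleEpi32(k uint16, a [16]uint32, imm8 byte) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			lane := (j / 4) * 4                 // base index of this 128-bit lane
			sel := int(imm8>>(uint(j%4)*2)) & 3 // SELECT4 control for slot j%4
			dst[j] = a[lane+sel]
		}
		return
	}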

func M512MaskzShuffleF32x4

func M512MaskzShuffleF32x4(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_maskz_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
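
The shuffle moves whole 128-bit blocks: the two low destination blocks come from 'a' and the two high ones from 'b', each chosen by two imm8 bits. A plain-Go sketch (maskzShuffleF32x4 is an illustrative name, not a package function):

	func maskzShuffleF32x4(k uint16, a, b [16]float32, imm8 byte) (dst [16]float32) {
		var tmp [16]float32
		for blk := 0; blk < 4; blk++ {
			src := a
			if blk >= 2 {
				src = b // upper two destination blocks select from b
			}
			sel := int(imm8>>(uint(blk)*2)) & 3 // which 128-bit source block
			copy(tmp[blk*4:blk*4+4], src[sel*4:sel*4+4])
		}
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}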

func M512MaskzShuffleF64x2

func M512MaskzShuffleF64x2(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_maskz_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShuffleI32x4

func M512MaskzShuffleI32x4(k x86.Mmask16, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_maskz_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShuffleI64x2

func M512MaskzShuffleI64x2(k x86.Mmask8, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_maskz_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShufflePd

func M512MaskzShufflePd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzShufflePs

func M512MaskzShufflePs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllEpi32

func M512MaskzSllEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_sll_epi32'. Requires AVX512F.
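
All lanes share one count taken from the low 64 bits of the 'count' operand, and any count above 31 clears the lane instead of wrapping. A plain-Go sketch where the uint64 parameter models count[63:0] (maskzSllEpi32 is an illustrative name, not a package function):

	func maskzSllEpi32(k uint16, a [16]uint32, count uint64) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			if k&(1<<uint(j)) != 0 && count <= 31 {
				// Go's << on uint32 also yields 0 for counts >= 32, so the
				// explicit clamp simply mirrors the pseudocode's check.
				dst[j] = a[j] << count
			} // count > 31 or mask bit clear: lane stays zero
		}
		return
	}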

func M512MaskzSllEpi64

func M512MaskzSllEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_sll_epi64'. Requires AVX512F.

func M512MaskzSlliEpi32

func M512MaskzSlliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSlliEpi64

func M512MaskzSlliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllvEpi32

func M512MaskzSllvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm512_maskz_sllv_epi32'. Requires AVX512F.

func M512MaskzSllvEpi64

func M512MaskzSllvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_maskz_sllv_epi64'. Requires AVX512F.

func M512MaskzSqrtPd

func M512MaskzSqrtPd(k x86.Mmask8, a x86.M512d) (dst x86.M512d)

M512MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_pd'. Requires AVX512F.

func M512MaskzSqrtPs

func M512MaskzSqrtPs(k x86.Mmask16, a x86.M512) (dst x86.M512)

M512MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_ps'. Requires AVX512F.

func M512MaskzSqrtRoundPd

func M512MaskzSqrtRoundPd(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512d)

M512MaskzSqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := SQRT(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_maskz_sqrt_round_pd'. Requires AVX512F.

func M512MaskzSqrtRoundPs

func M512MaskzSqrtRoundPs(k x86.Mmask16, a x86.M512, rounding int) (dst x86.M512)

M512MaskzSqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := SQRT(a[i+31:i])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_maskz_sqrt_round_ps'. Requires AVX512F.

func M512MaskzSraEpi32

func M512MaskzSraEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_sra_epi32'. Requires AVX512F.

func M512MaskzSraEpi64

func M512MaskzSraEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_sra_epi64'. Requires AVX512F.

func M512MaskzSraiEpi32

func M512MaskzSraiEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSraiEpi64

func M512MaskzSraiEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
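
A count of 63 already smears the sign bit across the whole lane, so clamping the count reproduces the SignBit case above. A plain-Go sketch (maskzSraiEpi64 is an illustrative name, not a package function; Go's >> on signed integers is the arithmetic shift that SignExtend describes):

	func maskzSraiEpi64(k uint8, a [8]int64, imm8 byte) (dst [8]int64) {
		for j := 0; j < 8; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays zero
			}
			n := uint(imm8)
			if n > 63 {
				n = 63 // fills the lane with the sign bit: 0 or -1
			}
			dst[j] = a[j] >> n
		}
		return
	}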

func M512MaskzSravEpi32

func M512MaskzSravEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm512_maskz_srav_epi32'. Requires AVX512F.

func M512MaskzSravEpi64

func M512MaskzSravEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_maskz_srav_epi64'. Requires AVX512F.

func M512MaskzSrlEpi32

func M512MaskzSrlEpi32(k x86.Mmask16, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srl_epi32'. Requires AVX512F.

func M512MaskzSrlEpi64

func M512MaskzSrlEpi64(k x86.Mmask8, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srl_epi64'. Requires AVX512F.

func M512MaskzSrliEpi32

func M512MaskzSrliEpi32(k x86.Mmask16, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSrliEpi64

func M512MaskzSrliEpi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzSrlvEpi32

func M512MaskzSrlvEpi32(k x86.Mmask16, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm512_maskz_srlv_epi32'. Requires AVX512F.

func M512MaskzSrlvEpi64

func M512MaskzSrlvEpi64(k x86.Mmask8, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_maskz_srlv_epi64'. Requires AVX512F.

func M512MaskzSubEpi32

func M512MaskzSubEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm512_maskz_sub_epi32'. Requires AVX512F.

func M512MaskzSubEpi64

func M512MaskzSubEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_maskz_sub_epi64'. Requires AVX512F.

func M512MaskzSubPd

func M512MaskzSubPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_pd'. Requires AVX512F.

func M512MaskzSubPs

func M512MaskzSubPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_ps'. Requires AVX512F.

func M512MaskzSubRoundPd

func M512MaskzSubRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512MaskzSubRoundPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := a[i+63:i] - b[i+63:i]
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm512_maskz_sub_round_pd'. Requires AVX512F.

func M512MaskzSubRoundPs

func M512MaskzSubRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512MaskzSubRoundPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := a[i+31:i] - b[i+31:i]
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm512_maskz_sub_round_ps'. Requires AVX512F.

func M512MaskzTernarylogicEpi32

func M512MaskzTernarylogicEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
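
The per-bit truth-table lookup is easier to see in scalar form. A minimal sketch for a single 32-bit lane, using a hypothetical helper name that is not part of this package:

	func ternaryLogic32(a, b, c uint32, imm8 uint8) (dst uint32) {
		for h := 0; h < 32; h++ {
			idx := ((a>>h)&1)<<2 | ((b>>h)&1)<<1 | ((c >> h) & 1)
			dst |= uint32((imm8>>idx)&1) << h
		}
		return dst
	}

For example, imm8 = 0xE8 selects the three-way majority function, and imm8 = 0x96 a three-way XOR.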

func M512MaskzTernarylogicEpi64

func M512MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that index in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512MaskzUnpackhiEpi32

func M512MaskzUnpackhiEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi32'. Requires AVX512F.
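
INTERLEAVE_HIGH_DWORDS works independently on each 128-bit lane; in scalar Go it is just a reordering of four-element groups. An illustrative sketch, not the package API:

	func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
		// take the two high dwords of each source, interleaved
		return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
	}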

func M512MaskzUnpackhiEpi64

func M512MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_maskz_unpackhi_epi64'. Requires AVX512F.

func M512MaskzUnpackhiPd

func M512MaskzUnpackhiPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_maskz_unpackhi_pd'. Requires AVX512F.

func M512MaskzUnpackhiPs

func M512MaskzUnpackhiPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_maskz_unpackhi_ps'. Requires AVX512F.

func M512MaskzUnpackloEpi32

func M512MaskzUnpackloEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi32'. Requires AVX512F.

func M512MaskzUnpackloEpi64

func M512MaskzUnpackloEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_maskz_unpacklo_epi64'. Requires AVX512F.

func M512MaskzUnpackloPd

func M512MaskzUnpackloPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_maskz_unpacklo_pd'. Requires AVX512F.

func M512MaskzUnpackloPs

func M512MaskzUnpackloPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_maskz_unpacklo_ps'. Requires AVX512F.

func M512MaskzXorEpi32

func M512MaskzXorEpi32(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm512_maskz_xor_epi32'. Requires AVX512F.

func M512MaskzXorEpi64

func M512MaskzXorEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_maskz_xor_epi64'. Requires AVX512F.

func M512MaxEpi64

func M512MaxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm512_max_epi64'. Requires AVX512F.

func M512MaxEpu64

func M512MaxEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm512_max_epu64'. Requires AVX512F.
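
VPMAXSQ and VPMAXUQ differ only in how the pseudocode's '>' interprets the bit patterns: signed for epi64, unsigned for epu64. A one-lane sketch of each (hypothetical helpers, not the package API):

	func maxEpi64(a, b int64) int64 { // signed compare (VPMAXSQ)
		if a > b {
			return a
		}
		return b
	}

	func maxEpu64(a, b uint64) uint64 { // unsigned compare (VPMAXUQ)
		if a > b {
			return a
		}
		return b
	}

The same bits can order differently: 0xFFFFFFFFFFFFFFFF is -1 (less than 0) when signed but the largest value when unsigned.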

func M512MaxPd

func M512MaxPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_pd'. Requires AVX512F.

func M512MaxPs

func M512MaxPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_ps'. Requires AVX512F.

func M512MaxRoundPd

func M512MaxRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MaxRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm512_max_round_pd'. Requires AVX512F.

func M512MaxRoundPs

func M512MaxRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MaxRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm512_max_round_ps'. Requires AVX512F.

func M512MinEpi64

func M512MinEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm512_min_epi64'. Requires AVX512F.

func M512MinEpu64

func M512MinEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm512_min_epu64'. Requires AVX512F.

func M512MinPd

func M512MinPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_min_pd'. Requires AVX512F.

func M512MinPs

func M512MinPs(a x86.M512, b x86.M512) (dst x86.M512)

M512MinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_min_ps'. Requires AVX512F.

func M512MinRoundPd

func M512MinRoundPd(a x86.M512d, b x86.M512d, sae int) (dst x86.M512d)

M512MinRoundPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm512_min_round_pd'. Requires AVX512F.

func M512MinRoundPs

func M512MinRoundPs(a x86.M512, b x86.M512, sae int) (dst x86.M512)

M512MinRoundPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 15
		i := j*32
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm512_min_round_ps'. Requires AVX512F.

func M512MovedupPd

func M512MovedupPd(a x86.M512d) (dst x86.M512d)

M512MovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
dst[191:128] := a[191:128]
dst[255:192] := a[191:128]
dst[319:256] := a[319:256]
dst[383:320] := a[319:256]
dst[447:384] := a[447:384]
dst[511:448] := a[447:384]
dst[MAX:512] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm512_movedup_pd'. Requires AVX512F.
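
In scalar terms each even-indexed element is copied into the odd slot that follows it. A minimal sketch, not the package API:

	func movedupPd(a [8]float64) (dst [8]float64) {
		for j := 0; j < 8; j += 2 {
			dst[j] = a[j]   // even element kept in place
			dst[j+1] = a[j] // and duplicated into the odd slot
		}
		return dst
	}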

func M512MovehdupPs

func M512MovehdupPs(a x86.M512) (dst x86.M512)

M512MovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
dst[159:128] := a[191:160]
dst[191:160] := a[191:160]
dst[223:192] := a[255:224]
dst[255:224] := a[255:224]
dst[287:256] := a[319:288]
dst[319:288] := a[319:288]
dst[351:320] := a[383:352]
dst[383:352] := a[383:352]
dst[415:384] := a[447:416]
dst[447:416] := a[447:416]
dst[479:448] := a[511:480]
dst[511:480] := a[511:480]
dst[MAX:512] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm512_movehdup_ps'. Requires AVX512F.

func M512MoveldupPs

func M512MoveldupPs(a x86.M512) (dst x86.M512)

M512MoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst'.

dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
dst[159:128] := a[159:128]
dst[191:160] := a[159:128]
dst[223:192] := a[223:192]
dst[255:224] := a[223:192]
dst[287:256] := a[287:256]
dst[319:288] := a[287:256]
dst[351:320] := a[351:320]
dst[383:352] := a[351:320]
dst[415:384] := a[415:384]
dst[447:416] := a[415:384]
dst[479:448] := a[479:448]
dst[511:480] := a[479:448]
dst[MAX:512] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm512_moveldup_ps'. Requires AVX512F.

func M512MulEpi32

func M512MulEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm512_mul_epi32'. Requires AVX512F.
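
Only the low dword of each 64-bit lane participates, and for this signed variant it is sign-extended before the widening multiply. A one-lane sketch (hypothetical helper, not the package API):

	func mulEpi32Lane(a, b uint64) uint64 {
		lo1 := int64(int32(uint32(a))) // sign-extend the low 32 bits
		lo2 := int64(int32(uint32(b)))
		return uint64(lo1 * lo2) // full 64-bit signed product
	}

The unsigned variant below (VPMULUDQ) zero-extends instead: uint64(uint32(a)) * uint64(uint32(b)).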

func M512MulEpu32

func M512MulEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm512_mul_epu32'. Requires AVX512F.

func M512MulloxEpi64

func M512MulloxEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloxEpi64: Multiply packed 64-bit integers in 'a' and 'b', and store the low 64 bits of each product in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_mullox_epi64'. Requires AVX512F.

func M512NearbyintPd

func M512NearbyintPd(a x86.M512d) (dst x86.M512d)

M512NearbyintPd: Rounds each packed double-precision (64-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := NearbyInt(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_nearbyint_pd'. Requires AVX512F.

func M512NearbyintPs

func M512NearbyintPs(a x86.M512) (dst x86.M512)

M512NearbyintPs: Rounds each packed single-precision (32-bit) floating-point element in 'a' to the nearest integer value and stores the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := NearbyInt(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_nearbyint_ps'. Requires AVX512F.

func M512PermutePd

func M512PermutePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512PermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
IF (imm8[2] == 0) dst[191:128] := a[191:128]
IF (imm8[2] == 1) dst[191:128] := a[255:192]
IF (imm8[3] == 0) dst[255:192] := a[191:128]
IF (imm8[3] == 1) dst[255:192] := a[255:192]
IF (imm8[4] == 0) dst[319:256] := a[319:256]
IF (imm8[4] == 1) dst[319:256] := a[383:320]
IF (imm8[5] == 0) dst[383:320] := a[319:256]
IF (imm8[5] == 1) dst[383:320] := a[383:320]
IF (imm8[6] == 0) dst[447:384] := a[447:384]
IF (imm8[6] == 1) dst[447:384] := a[511:448]
IF (imm8[7] == 0) dst[511:448] := a[447:384]
IF (imm8[7] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutePs

func M512PermutePs(a x86.M512, imm8 byte) (dst x86.M512)

M512PermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(a[383:256], imm8[5:4])
dst[383:352] := SELECT4(a[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(a[511:384], imm8[5:4])
dst[511:480] := SELECT4(a[511:384], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutevarPd

func M512PermutevarPd(a x86.M512d, b x86.M512i) (dst x86.M512d)

M512PermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.

IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
IF (b[129] == 0) dst[191:128] := a[191:128]
IF (b[129] == 1) dst[191:128] := a[255:192]
IF (b[193] == 0) dst[255:192] := a[191:128]
IF (b[193] == 1) dst[255:192] := a[255:192]
IF (b[257] == 0) dst[319:256] := a[319:256]
IF (b[257] == 1) dst[319:256] := a[383:320]
IF (b[321] == 0) dst[383:320] := a[319:256]
IF (b[321] == 1) dst[383:320] := a[383:320]
IF (b[385] == 0) dst[447:384] := a[447:384]
IF (b[385] == 1) dst[447:384] := a[511:448]
IF (b[449] == 0) dst[511:448] := a[447:384]
IF (b[449] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm512_permutevar_pd'. Requires AVX512F.

func M512PermutevarPs

func M512PermutevarPs(a x86.M512, b x86.M512i) (dst x86.M512)

M512PermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' within 128-bit lanes using the control in 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[159:128] := SELECT4(a[255:128], b[129:128])
dst[191:160] := SELECT4(a[255:128], b[161:160])
dst[223:192] := SELECT4(a[255:128], b[193:192])
dst[255:224] := SELECT4(a[255:128], b[225:224])
dst[287:256] := SELECT4(a[383:256], b[257:256])
dst[319:288] := SELECT4(a[383:256], b[289:288])
dst[351:320] := SELECT4(a[383:256], b[321:320])
dst[383:352] := SELECT4(a[383:256], b[353:352])
dst[415:384] := SELECT4(a[511:384], b[385:384])
dst[447:416] := SELECT4(a[511:384], b[417:416])
dst[479:448] := SELECT4(a[511:384], b[449:448])
dst[511:480] := SELECT4(a[511:384], b[481:480])
dst[MAX:512] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm512_permutevar_ps'. Requires AVX512F.

func M512Permutex2varEpi32

func M512Permutex2varEpi32(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm512_permutex2var_epi32'. Requires AVX512F.
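
Each idx element uses its low four bits as a dword index and bit 4 as a table selector between 'a' and 'b'. A sketch of the whole-vector semantics (hypothetical helper, not the package API):

	func permutex2varEpi32(a, idx, b [16]uint32) (dst [16]uint32) {
		for j := 0; j < 16; j++ {
			off := idx[j] & 0xF   // idx[i+3:i]
			if idx[j]&0x10 != 0 { // idx[i+4] selects the table
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return dst
	}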

func M512Permutex2varEpi64

func M512Permutex2varEpi64(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm512_permutex2var_epi64'. Requires AVX512F.

func M512Permutex2varPd

func M512Permutex2varPd(a x86.M512d, idx x86.M512i, b x86.M512d) (dst x86.M512d)

M512Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	off := idx[i+2:i]*64
	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm512_permutex2var_pd'. Requires AVX512F.

func M512Permutex2varPs

func M512Permutex2varPs(a x86.M512, idx x86.M512i, b x86.M512) (dst x86.M512)

M512Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	off := idx[i+3:i]*32
	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm512_permutex2var_ps'. Requires AVX512F.

func M512PermutexEpi64

func M512PermutexEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512PermutexEpi64: Shuffle 64-bit integers in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutex_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutexPd

func M512PermutexPd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512PermutexPd: Shuffle double-precision (64-bit) floating-point elements in 'a' within 256-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutex_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512PermutexvarEpi32

func M512PermutexvarEpi32(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutexvarEpi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm512_permutexvar_epi32'. Requires AVX512F.

func M512PermutexvarEpi64

func M512PermutexvarEpi64(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutexvarEpi64: Shuffle 64-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm512_permutexvar_epi64'. Requires AVX512F.

func M512PermutexvarPd

func M512PermutexvarPd(idx x86.M512i, a x86.M512d) (dst x86.M512d)

M512PermutexvarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	id := idx[i+2:i]*64
	dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm512_permutexvar_pd'. Requires AVX512F.

func M512PermutexvarPs

func M512PermutexvarPs(idx x86.M512i, a x86.M512) (dst x86.M512)

M512PermutexvarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	id := idx[i+3:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm512_permutexvar_ps'. Requires AVX512F.

func M512PowPd

func M512PowPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512PowPd: Compute the exponential value of packed double-precision (64-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_pow_pd'. Requires AVX512F.

func M512PowPs

func M512PowPs(a x86.M512, b x86.M512) (dst x86.M512)

M512PowPs: Compute the exponential value of packed single-precision (32-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_pow_ps'. Requires AVX512F.

func M512Rcp14Pd

func M512Rcp14Pd(a x86.M512d) (dst x86.M512d)

M512Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm512_rcp14_pd'. Requires AVX512F.

func M512Rcp14Ps

func M512Rcp14Ps(a x86.M512) (dst x86.M512)

M512Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm512_rcp14_ps'. Requires AVX512F.

func M512RecipPd

func M512RecipPd(a x86.M512d) (dst x86.M512d)

M512RecipPd: Computes the reciprocal of packed double-precision (64-bit) floating-point elements in 'a', storing the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (1 / a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_recip_pd'. Requires AVX512F.

func M512RecipPs

func M512RecipPs(a x86.M512) (dst x86.M512)

M512RecipPs: Computes the reciprocal of packed single-precision (32-bit) floating-point elements in 'a', storing the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (1 / a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_recip_ps'. Requires AVX512F.

func M512RemEpi16

func M512RemEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 16-bit integers in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi16'. Requires AVX512F.

func M512RemEpi32

func M512RemEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi32'. Requires AVX512F.

func M512RemEpi64

func M512RemEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 64-bit integers in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi64'. Requires AVX512F.

func M512RemEpi8

func M512RemEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 8-bit integers in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epi8'. Requires AVX512F.

func M512RemEpu16

func M512RemEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 16-bit integers in 'dst'.

FOR j := 0 to 31
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu16'. Requires AVX512F.

func M512RemEpu32

func M512RemEpu32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 15
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu32'. Requires AVX512F.

func M512RemEpu64

func M512RemEpu64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 64-bit integers in 'dst'.

FOR j := 0 to 7
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu64'. Requires AVX512F.

func M512RemEpu8

func M512RemEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RemEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 8-bit integers in 'dst'.

FOR j := 0 to 63
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rem_epu8'. Requires AVX512F.

func M512RintPd

func M512RintPd(a x86.M512d) (dst x86.M512d)

M512RintPd: Rounds the packed double-precision (64-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RoundToNearestEven(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rint_pd'. Requires AVX512F.

func M512RintPs

func M512RintPs(a x86.M512) (dst x86.M512)

M512RintPs: Rounds the packed single-precision (32-bit) floating-point elements in 'a' to the nearest even integer value and stores the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RoundToNearestEven(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_rint_ps'. Requires AVX512F.

func M512RolEpi32

func M512RolEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm512_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
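
Outside of intrinsics the same per-lane operation is available from the standard library, which also reduces the rotate count modulo 32. A minimal sketch (hypothetical helper, not the package API):

	import "math/bits"

	func rolEpi32(a [16]uint32, imm8 uint8) (dst [16]uint32) {
		for j, v := range a {
			dst[j] = bits.RotateLeft32(v, int(imm8)) // count taken mod 32
		}
		return dst
	}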

func M512RolEpi64

func M512RolEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm512_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RolvEpi32

func M512RolvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm512_rolv_epi32'. Requires AVX512F.

func M512RolvEpi64

func M512RolvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm512_rolv_epi64'. Requires AVX512F.

func M512RorEpi32

func M512RorEpi32(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm512_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RorEpi64

func M512RorEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm512_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RorvEpi32

func M512RorvEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm512_rorv_epi32'. Requires AVX512F.

func M512RorvEpi64

func M512RorvEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm512_rorv_epi64'. Requires AVX512F.

func M512RoundscalePd

func M512RoundscalePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
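
Ignoring the exception flags, the per-element operation is: scale by 2^M, round to an integer, and scale back, leaving M fraction bits. A one-element sketch with round-to-nearest-even (hypothetical helper, not the package API):

	import "math"

	func roundscalePd(src float64, m uint8) float64 {
		scale := math.Ldexp(1, int(m)) // 2^M
		return math.RoundToEven(src*scale) / scale
	}

For example, roundscalePd(1.23, 4) rounds to the nearest multiple of 1/16 and returns 1.25.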

func M512RoundscalePs

func M512RoundscalePs(a x86.M512, imm8 byte) (dst x86.M512)

M512RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RoundscaleRoundPd

func M512RoundscaleRoundPd(a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512RoundscaleRoundPd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm512_roundscale_round_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512RoundscaleRoundPs

func M512RoundscaleRoundPs(a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512RoundscaleRoundPs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm512_roundscale_round_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512Rsqrt14Pd

func M512Rsqrt14Pd(a x86.M512d) (dst x86.M512d)

M512Rsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm512_rsqrt14_pd'. Requires AVX512F.

func M512Rsqrt14Ps

func M512Rsqrt14Ps(a x86.M512) (dst x86.M512)

M512Rsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm512_rsqrt14_ps'. Requires AVX512F.

func M512ScalefPd

func M512ScalefPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_pd'. Requires AVX512F.
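
For finite inputs the operation reduces to dst = a * 2^FLOOR(b); the NaN, infinity, and DAZ handling above is omitted in this sketch (hypothetical helper, not the package API):

	import "math"

	func scalefPd(a, b float64) float64 {
		// finite-input case only; assumes FLOOR(b) fits in an int
		return math.Ldexp(a, int(math.Floor(b))) // a * 2^floor(b)
	}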

func M512ScalefPs

func M512ScalefPs(a x86.M512, b x86.M512) (dst x86.M512)

M512ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_ps'. Requires AVX512F.

func M512ScalefRoundPd

func M512ScalefRoundPd(a x86.M512d, b x86.M512d, rounding int) (dst x86.M512d)

M512ScalefRoundPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm512_scalef_round_pd'. Requires AVX512F.

func M512ScalefRoundPs

func M512ScalefRoundPs(a x86.M512, b x86.M512, rounding int) (dst x86.M512)

M512ScalefRoundPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm512_scalef_round_ps'. Requires AVX512F.

func M512Set1Epi16

func M512Set1Epi16(a int16) (dst x86.M512i)

M512Set1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_epi16'. Requires AVX512F.

func M512Set1Epi32

func M512Set1Epi32(a int) (dst x86.M512i)

M512Set1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm512_set1_epi32'. Requires AVX512F.

func M512Set1Epi64

func M512Set1Epi64(a int64) (dst x86.M512i)

M512Set1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm512_set1_epi64'. Requires AVX512F.

func M512Set1Epi8

func M512Set1Epi8(a byte) (dst x86.M512i)

M512Set1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_epi8'. Requires AVX512F.

func M512Set1Pd

func M512Set1Pd(a float64) (dst x86.M512d)

M512Set1Pd: Broadcast double-precision (64-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_pd'. Requires AVX512F.

func M512Set1Ps

func M512Set1Ps(a float32) (dst x86.M512)

M512Set1Ps: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set1_ps'. Requires AVX512F.
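
Every Set1 variant follows the same pattern: one scalar replicated into every lane. A scalar sketch of the 32-bit case (illustrative, using a plain Go array in place of x86.M512i):

package sketch

// set1Epi32 sketches _mm512_set1_epi32: replicate one 32-bit
// integer into all 16 lanes of a 512-bit vector.
func set1Epi32(a int32) [16]int32 {
	var dst [16]int32
	for j := range dst {
		dst[j] = a
	}
	return dst
}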

func M512Set4Epi32

func M512Set4Epi32(d int, c int, b int, a int) (dst x86.M512i)

M512Set4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4 element sequence.

dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_epi32'. Requires AVX512F.
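
The Set4 variants tile a four-element sequence across the register, lowest lane first. A sketch, assuming lane 0 is the least-significant 32 bits:

package sketch

// set4Epi32 sketches _mm512_set4_epi32: lanes 0..3 receive
// d, c, b, a, and the pattern repeats up through lane 15.
func set4Epi32(d, c, b, a int32) [16]int32 {
	seq := [4]int32{d, c, b, a}
	var dst [16]int32
	for j := range dst {
		dst[j] = seq[j%4]
	}
	return dst
}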

func M512Set4Epi64

func M512Set4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)

M512Set4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4 element sequence.

dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_epi64'. Requires AVX512F.

func M512Set4Pd

func M512Set4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)

M512Set4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4 element sequence.

dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_pd'. Requires AVX512F.

func M512Set4Ps

func M512Set4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)

M512Set4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4 element sequence.

dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set4_ps'. Requires AVX512F.

func M512SetEpi32

func M512SetEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)

M512SetEpi32: Set packed 32-bit integers in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_epi32'. Requires AVX512F.

func M512SetEpi64

func M512SetEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)

M512SetEpi64: Set packed 64-bit integers in 'dst' with the supplied values.

dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_epi64'. Requires AVX512F.

func M512SetPd

func M512SetPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)

M512SetPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values.

dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_pd'. Requires AVX512F.

func M512SetPs

func M512SetPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)

M512SetPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_set_ps'. Requires AVX512F.

func M512Setr4Epi32

func M512Setr4Epi32(d int, c int, b int, a int) (dst x86.M512i)

M512Setr4Epi32: Set packed 32-bit integers in 'dst' with the repeated 4 element sequence in reverse order.

dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_epi32'. Requires AVX512F.

func M512Setr4Epi64

func M512Setr4Epi64(d int64, c int64, b int64, a int64) (dst x86.M512i)

M512Setr4Epi64: Set packed 64-bit integers in 'dst' with the repeated 4 element sequence in reverse order.

dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_epi64'. Requires AVX512F.

func M512Setr4Pd

func M512Setr4Pd(d float64, c float64, b float64, a float64) (dst x86.M512d)

M512Setr4Pd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the repeated 4 element sequence in reverse order.

dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_pd'. Requires AVX512F.

func M512Setr4Ps

func M512Setr4Ps(d float32, c float32, b float32, a float32) (dst x86.M512)

M512Setr4Ps: Set packed single-precision (32-bit) floating-point elements in 'dst' with the repeated 4 element sequence in reverse order.

dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr4_ps'. Requires AVX512F.

func M512SetrEpi32

func M512SetrEpi32(e15 int, e14 int, e13 int, e12 int, e11 int, e10 int, e9 int, e8 int, e7 int, e6 int, e5 int, e4 int, e3 int, e2 int, e1 int, e0 int) (dst x86.M512i)

M512SetrEpi32: Set packed 32-bit integers in 'dst' with the supplied values in reverse order.

dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_epi32'. Requires AVX512F.
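
The Setr variants are the Set variants with the argument order reversed: the first argument lands in lane 0 rather than the last. A sketch for the 64-bit case:

package sketch

// setEpi64 places its last argument in lane 0; setrEpi64 places
// its first argument there, so setrEpi64 behaves like setEpi64
// with the argument list reversed.
func setEpi64(e7, e6, e5, e4, e3, e2, e1, e0 int64) [8]int64 {
	return [8]int64{e0, e1, e2, e3, e4, e5, e6, e7}
}

func setrEpi64(e7, e6, e5, e4, e3, e2, e1, e0 int64) [8]int64 {
	return [8]int64{e7, e6, e5, e4, e3, e2, e1, e0}
}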

func M512SetrEpi64

func M512SetrEpi64(e7 int64, e6 int64, e5 int64, e4 int64, e3 int64, e2 int64, e1 int64, e0 int64) (dst x86.M512i)

M512SetrEpi64: Set packed 64-bit integers in 'dst' with the supplied values in reverse order.

dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_epi64'. Requires AVX512F.

func M512SetrPd

func M512SetrPd(e7 float64, e6 float64, e5 float64, e4 float64, e3 float64, e2 float64, e1 float64, e0 float64) (dst x86.M512d)

M512SetrPd: Set packed double-precision (64-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_pd'. Requires AVX512F.

func M512SetrPs

func M512SetrPs(e15 float32, e14 float32, e13 float32, e12 float32, e11 float32, e10 float32, e9 float32, e8 float32, e7 float32, e6 float32, e5 float32, e4 float32, e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M512)

M512SetrPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_setr_ps'. Requires AVX512F.

func M512Setzero

func M512Setzero() (dst x86.M512)

M512Setzero: Return vector of type __m512 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero'. Requires AVX512F.

func M512SetzeroEpi32

func M512SetzeroEpi32() (dst x86.M512i)

M512SetzeroEpi32: Return vector of type __m512i with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_epi32'. Requires AVX512F.

func M512SetzeroPd

func M512SetzeroPd() (dst x86.M512d)

M512SetzeroPd: Return vector of type __m512d with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_pd'. Requires AVX512F.

func M512SetzeroPs

func M512SetzeroPs() (dst x86.M512)

M512SetzeroPs: Return vector of type __m512 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_ps'. Requires AVX512F.

func M512SetzeroSi512

func M512SetzeroSi512() (dst x86.M512i)

M512SetzeroSi512: Return vector of type __m512i with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm512_setzero_si512'. Requires AVX512F.

func M512ShuffleF32x4

func M512ShuffleF32x4(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512ShuffleF32x4: Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFF32X4'. Intrinsic: '_mm512_shuffle_f32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
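
Each two-bit field of 'imm8' selects one 128-bit lane: the low two fields index into 'a', the high two into 'b'. A scalar sketch with hypothetical helper names, modelling the vector as four lanes of four float32:

package sketch

// select4 mirrors SELECT4: pick one of four 128-bit lanes.
func select4(src [4][4]float32, control byte) [4]float32 {
	return src[control&3]
}

// shuffleF32x4 sketches _mm512_shuffle_f32x4: destination lanes
// 0-1 come from a, lanes 2-3 from b, each chosen by two imm8 bits.
func shuffleF32x4(a, b [4][4]float32, imm8 byte) [4][4]float32 {
	return [4][4]float32{
		select4(a, imm8),    // dst[127:0]
		select4(a, imm8>>2), // dst[255:128]
		select4(b, imm8>>4), // dst[383:256]
		select4(b, imm8>>6), // dst[511:384]
	}
}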

func M512ShuffleF64x2

func M512ShuffleF64x2(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512ShuffleF64x2: Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFF64X2'. Intrinsic: '_mm512_shuffle_f64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShuffleI32x4

func M512ShuffleI32x4(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleI32x4: Shuffle 128-bits (composed of 4 32-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFI32X4'. Intrinsic: '_mm512_shuffle_i32x4'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShuffleI64x2

func M512ShuffleI64x2(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleI64x2: Shuffle 128-bits (composed of 2 64-bit integers) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src[127:0]
	1:	tmp[127:0] := src[255:128]
	2:	tmp[127:0] := src[383:256]
	3:	tmp[127:0] := src[511:384]
	ESAC
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFI64X2'. Intrinsic: '_mm512_shuffle_i64x2'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShufflePd

func M512ShufflePd(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512ShufflePd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
dst[MAX:512] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm512_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512ShufflePs

func M512ShufflePs(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512ShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(b[255:128], imm8[5:4])
dst[255:224] := SELECT4(b[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(b[383:256], imm8[5:4])
dst[383:352] := SELECT4(b[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(b[511:384], imm8[5:4])
dst[511:480] := SELECT4(b[511:384], imm8[7:6])
dst[MAX:512] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm512_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SinPd

func M512SinPd(a x86.M512d) (dst x86.M512d)

M512SinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sin_pd'. Requires AVX512F.

func M512SinPs

func M512SinPs(a x86.M512) (dst x86.M512)

M512SinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sin_ps'. Requires AVX512F.

func M512SincosPd

func M512SincosPd(cos_res *x86.M512d, a x86.M512d) (dst x86.M512d)

M512SincosPd: Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in 'a' and stores the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	cos_res[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sincos_pd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SincosPs

func M512SincosPs(cos_res *x86.M512, a x86.M512) (dst x86.M512)

M512SincosPs: Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in 'a' and stores the results of the sine computation in 'dst' and the results of the cosine computation in 'cos_res'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	cos_res[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sincos_ps'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func M512SindPd

func M512SindPd(a x86.M512d) (dst x86.M512d)

M512SindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sind_pd'. Requires AVX512F.

func M512SindPs

func M512SindPs(a x86.M512) (dst x86.M512)

M512SindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sind_ps'. Requires AVX512F.

func M512SinhPd

func M512SinhPd(a x86.M512d) (dst x86.M512d)

M512SinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sinh_pd'. Requires AVX512F.

func M512SinhPs

func M512SinhPs(a x86.M512) (dst x86.M512)

M512SinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_sinh_ps'. Requires AVX512F.

func M512SllEpi32

func M512SllEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm512_sll_epi32'. Requires AVX512F.

func M512SllEpi64

func M512SllEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_sll_epi64'. Requires AVX512F.
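
Note that all lanes shift by the same count (the low 64 bits of 'count'), and counts above the element width produce zero. Go's variable shifts already behave this way, but a sketch of one 64-bit lane keeps the guard explicit:

package sketch

// sllEpi64Lane sketches one lane of _mm512_sll_epi64: a left shift
// that yields zero once the count exceeds 63. (Go's own a << count
// also returns 0 for counts >= 64; the guard mirrors the pseudocode.)
func sllEpi64Lane(a, count uint64) uint64 {
	if count > 63 {
		return 0
	}
	return a << count
}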

func M512SlliEpi64

func M512SlliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm512_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SllvEpi64

func M512SllvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm512_sllv_epi64'. Requires AVX512F.

func M512SqrtPd

func M512SqrtPd(a x86.M512d) (dst x86.M512d)

M512SqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_pd'. Requires AVX512F.

func M512SqrtPs

func M512SqrtPs(a x86.M512) (dst x86.M512)

M512SqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_ps'. Requires AVX512F.

func M512SqrtRoundPd

func M512SqrtRoundPd(a x86.M512d, rounding int) (dst x86.M512d)

M512SqrtRoundPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := SQRT(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm512_sqrt_round_pd'. Requires AVX512F.

func M512SqrtRoundPs

func M512SqrtRoundPs(a x86.M512, rounding int) (dst x86.M512)

M512SqrtRoundPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := SQRT(a[i+31:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm512_sqrt_round_ps'. Requires AVX512F.

func M512SraEpi32

func M512SraEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm512_sra_epi32'. Requires AVX512F.

func M512SraEpi64

func M512SraEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_sra_epi64'. Requires AVX512F.

func M512SraiEpi64

func M512SraiEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm512_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
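
One lane of this operation behaves like Go's arithmetic right shift on int64, with oversized counts collapsing every bit to the sign bit. A sketch:

package sketch

// sraiEpi64Lane sketches one lane of _mm512_srai_epi64: an arithmetic
// right shift that saturates to all sign bits for counts above 63.
func sraiEpi64Lane(a int64, imm8 uint8) int64 {
	if imm8 > 63 {
		return a >> 63 // all sign bits: 0 or -1
	}
	return a >> imm8
}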

func M512SravEpi64

func M512SravEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm512_srav_epi64'. Requires AVX512F.

func M512SrlEpi32

func M512SrlEpi32(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm512_srl_epi32'. Requires AVX512F.

func M512SrlEpi64

func M512SrlEpi64(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srl_epi64'. Requires AVX512F.

func M512SrliEpi64

func M512SrliEpi64(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm512_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512SrlvEpi64

func M512SrlvEpi64(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm512_srlv_epi64'. Requires AVX512F.

func M512SubEpi64

func M512SubEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm512_sub_epi64'. Requires AVX512F.

func M512SvmlRoundPd

func M512SvmlRoundPd(a x86.M512d) (dst x86.M512d)

M512SvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_svml_round_pd'. Requires AVX512F.

func M512TanPd

func M512TanPd(a x86.M512d) (dst x86.M512d)

M512TanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tan_pd'. Requires AVX512F.

func M512TanPs

func M512TanPs(a x86.M512) (dst x86.M512)

M512TanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tan_ps'. Requires AVX512F.

func M512TandPd

func M512TandPd(a x86.M512d) (dst x86.M512d)

M512TandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tand_pd'. Requires AVX512F.

func M512TandPs

func M512TandPs(a x86.M512) (dst x86.M512)

M512TandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tand_ps'. Requires AVX512F.

func M512TanhPd

func M512TanhPd(a x86.M512d) (dst x86.M512d)

M512TanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tanh_pd'. Requires AVX512F.

func M512TanhPs

func M512TanhPs(a x86.M512) (dst x86.M512)

M512TanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_tanh_ps'. Requires AVX512F.

func M512TernarylogicEpi32

func M512TernarylogicEpi32(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512TernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 15
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm512_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
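
In other words, 'imm8' is an eight-entry truth table indexed by the corresponding bits of 'a', 'b', and 'c'. A sketch of one 32-bit lane (hypothetical helper, not part of this package); for instance, imm8 = 0xE8 yields the bitwise majority of the three inputs:

package sketch

// ternlogLane sketches one 32-bit lane of _mm512_ternarylogic_epi32:
// for each bit position, look up imm8's bit at index (a<<2)|(b<<1)|c.
func ternlogLane(a, b, c uint32, imm8 uint8) uint32 {
	var dst uint32
	for h := 0; h < 32; h++ {
		idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1
		dst |= uint32(imm8>>idx&1) << h
	}
	return dst
}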

func M512TernarylogicEpi64

func M512TernarylogicEpi64(a x86.M512i, b x86.M512i, c x86.M512i, imm8 byte) (dst x86.M512i)

M512TernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bit from 'a', 'b', and 'c' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst'.

FOR j := 0 to 7
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm512_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func M512TestEpi64Mask

func M512TestEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm512_test_epi64_mask'. Requires AVX512F.

func M512TestnEpi32Mask

func M512TestnEpi32Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask16)

M512TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 15
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm512_testn_epi32_mask'. Requires AVX512F.

func M512TestnEpi64Mask

func M512TestnEpi64Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask8)

M512TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 7
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm512_testn_epi64_mask'. Requires AVX512F.
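
test and testn are exact complements over the live mask bits: test sets k[j] where the lane-wise AND is non-zero, testn where it is zero. A sketch for the 64-bit variants:

package sketch

// testEpi64Mask sketches _mm512_test_epi64_mask: one mask bit per
// lane, set when a AND b is non-zero in that lane.
func testEpi64Mask(a, b [8]uint64) uint8 {
	var k uint8
	for j := range a {
		if a[j]&b[j] != 0 {
			k |= 1 << j
		}
	}
	return k
}

// testnEpi64Mask sketches _mm512_testn_epi64_mask: the complement.
func testnEpi64Mask(a, b [8]uint64) uint8 {
	return ^testEpi64Mask(a, b)
}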

func M512TruncPd

func M512TruncPd(a x86.M512d) (dst x86.M512d)

M512TruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_trunc_pd'. Requires AVX512F.

func M512TruncPs

func M512TruncPs(a x86.M512) (dst x86.M512)

M512TruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: '...'. Intrinsic: '_mm512_trunc_ps'. Requires AVX512F.

func M512Undefined

func M512Undefined() (dst x86.M512)

M512Undefined: Return vector of type __m512 with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined'. Requires AVX512F.

func M512UndefinedEpi32

func M512UndefinedEpi32() (dst x86.M512i)

M512UndefinedEpi32: Return vector of type __m512i with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_epi32'. Requires AVX512F.

func M512UndefinedPd

func M512UndefinedPd() (dst x86.M512d)

M512UndefinedPd: Return vector of type __m512d with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_pd'. Requires AVX512F.

func M512UndefinedPs

func M512UndefinedPs() (dst x86.M512)

M512UndefinedPs: Return vector of type __m512 with undefined elements.

Instruction: ''. Intrinsic: '_mm512_undefined_ps'. Requires AVX512F.

func M512UnpackhiEpi32

func M512UnpackhiEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm512_unpackhi_epi32'. Requires AVX512F.
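
A sketch of the per-lane interleave, treating one 128-bit lane as four 32-bit elements:

package sketch

// interleaveHighDwords mirrors INTERLEAVE_HIGH_DWORDS: interleave the
// upper two 32-bit elements of each source lane (elements 2 and 3).
func interleaveHighDwords(src1, src2 [4]uint32) [4]uint32 {
	return [4]uint32{src1[2], src2[2], src1[3], src2[3]}
}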

func M512UnpackhiEpi64

func M512UnpackhiEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm512_unpackhi_epi64'. Requires AVX512F.

func M512UnpackhiPd

func M512UnpackhiPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512UnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm512_unpackhi_pd'. Requires AVX512F.

func M512UnpackhiPs

func M512UnpackhiPs(a x86.M512, b x86.M512) (dst x86.M512)

M512UnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm512_unpackhi_ps'. Requires AVX512F.

func M512UnpackloEpi32

func M512UnpackloEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm512_unpacklo_epi32'. Requires AVX512F.

func M512UnpackloEpi64

func M512UnpackloEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm512_unpacklo_epi64'. Requires AVX512F.

func M512UnpackloPd

func M512UnpackloPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512UnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm512_unpacklo_pd'. Requires AVX512F.

func M512UnpackloPs

func M512UnpackloPs(a x86.M512, b x86.M512) (dst x86.M512)

M512UnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm512_unpacklo_ps'. Requires AVX512F.

func Mask2Permutex2varEpi32

func Mask2Permutex2varEpi32(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)

Mask2Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D'. Intrinsic: '_mm_mask2_permutex2var_epi32'. Requires AVX512F.
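
Each 'idx' element carries both the lane index and the table selector: for this 128-bit, 32-bit-element form, bits 1:0 pick the lane and bit 2 picks between 'a' and 'b'. A sketch with hypothetical helper names:

package sketch

// mask2Permutex2varEpi32 sketches _mm_mask2_permutex2var_epi32 on
// 4-lane arrays: idx bit 2 selects the table (a or b), bits 1:0 the
// lane; lanes whose mask bit is clear keep the idx value itself.
func mask2Permutex2varEpi32(a, idx [4]uint32, k uint8, b [4]uint32) [4]uint32 {
	var dst [4]uint32
	for j := range dst {
		if k>>j&1 == 0 {
			dst[j] = idx[j] // writemask off: copy from idx
			continue
		}
		off := idx[j] & 3 // lane within the chosen table
		if idx[j]&4 != 0 {
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}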

func Mask2Permutex2varEpi64

func Mask2Permutex2varEpi64(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)

Mask2Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q'. Intrinsic: '_mm_mask2_permutex2var_epi64'. Requires AVX512F.

func Mask2Permutex2varPd

func Mask2Permutex2varPd(a x86.M128d, idx x86.M128i, k x86.Mmask8, b x86.M128d) (dst x86.M128d)

Mask2Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := idx[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD'. Intrinsic: '_mm_mask2_permutex2var_pd'. Requires AVX512F.

func Mask2Permutex2varPs

func Mask2Permutex2varPs(a x86.M128, idx x86.M128i, k x86.Mmask8, b x86.M128) (dst x86.M128)

Mask2Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := idx[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS'. Intrinsic: '_mm_mask2_permutex2var_ps'. Requires AVX512F.

func Mask3FmaddPd

func Mask3FmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask3_fmadd_pd'. Requires AVX512F.
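
Go's math.FMA computes a*b+c with a single rounding, which is the closest scalar analogue of these fused operations. A sketch of the two-lane masked form:

package sketch

import "math"

// mask3FmaddPd sketches _mm_mask3_fmadd_pd on two-lane arrays:
// fused multiply-add where masked-off lanes keep c's value.
func mask3FmaddPd(a, b, c [2]float64, k uint8) [2]float64 {
	var dst [2]float64
	for j := range dst {
		if k>>j&1 == 1 {
			dst[j] = math.FMA(a[j], b[j], c[j]) // (a*b)+c, single rounding
		} else {
			dst[j] = c[j]
		}
	}
	return dst
}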

func Mask3FmaddPs

func Mask3FmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask3_fmadd_ps'. Requires AVX512F.

func Mask3FmaddRoundSd

func Mask3FmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_round_sd'. Requires AVX512F.

func Mask3FmaddRoundSs

func Mask3FmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_round_ss'. Requires AVX512F.

func Mask3FmaddSd

func Mask3FmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask3_fmadd_sd'. Requires AVX512F.

func Mask3FmaddSs

func Mask3FmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask3_fmadd_ss'. Requires AVX512F.

func Mask3FmaddsubPd

func Mask3FmaddsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask3_fmaddsub_pd'. Requires AVX512F.
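
The even/odd alternation is easy to misread: even lanes compute (a*b)-c and odd lanes (a*b)+c. A sketch, again using math.FMA as the scalar stand-in:

package sketch

import "math"

// mask3FmaddsubPd sketches _mm_mask3_fmaddsub_pd: even lanes subtract
// c, odd lanes add c; lanes whose mask bit is clear keep c.
func mask3FmaddsubPd(a, b, c [2]float64, k uint8) [2]float64 {
	var dst [2]float64
	for j := range dst {
		switch {
		case k>>j&1 == 0:
			dst[j] = c[j]
		case j%2 == 0:
			dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: (a*b)-c
		default:
			dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: (a*b)+c
		}
	}
	return dst
}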

func Mask3FmaddsubPs

func Mask3FmaddsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask3_fmaddsub_ps'. Requires AVX512F.

func Mask3FmsubPd

func Mask3FmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask3_fmsub_pd'. Requires AVX512F.

func Mask3FmsubPs

func Mask3FmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask3_fmsub_ps'. Requires AVX512F.

func Mask3FmsubRoundSd

func Mask3FmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_round_sd'. Requires AVX512F.

func Mask3FmsubRoundSs

func Mask3FmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_round_ss'. Requires AVX512F.

func Mask3FmsubSd

func Mask3FmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask3_fmsub_sd'. Requires AVX512F.

func Mask3FmsubSs

func Mask3FmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask3_fmsub_ss'. Requires AVX512F.

func Mask3FmsubaddPd

func Mask3FmsubaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask3_fmsubadd_pd'. Requires AVX512F.

func Mask3FmsubaddPs

func Mask3FmsubaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask3_fmsubadd_ps'. Requires AVX512F.
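
As an illustrative plain-Go model of the subtract/add alternation (the helper name is invented; no real intrinsic is involved):

	// fmsubaddPsModel mirrors _mm_mask3_fmsubadd_ps: even lanes add
	// 'c', odd lanes subtract it; masked-off lanes pass 'c' through.
	func fmsubaddPsModel(a, b, c [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) == 0 {
				dst[j] = c[j] // mask bit clear: pass through 'c'
				continue
			}
			if j%2 == 0 {
				dst[j] = a[j]*b[j] + c[j] // even lane: add
			} else {
				dst[j] = a[j]*b[j] - c[j] // odd lane: subtract
			}
		}
		return
	}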

func Mask3FnmaddPd

func Mask3FnmaddPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask3_fnmadd_pd'. Requires AVX512F.

func Mask3FnmaddPs

func Mask3FnmaddPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask3_fnmadd_ps'. Requires AVX512F.

func Mask3FnmaddRoundSd

func Mask3FnmaddRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_round_sd'. Requires AVX512F.

func Mask3FnmaddRoundSs

func Mask3FnmaddRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_round_ss'. Requires AVX512F.

func Mask3FnmaddSd

func Mask3FnmaddSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask3_fnmadd_sd'. Requires AVX512F.

func Mask3FnmaddSs

func Mask3FnmaddSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask3_fnmadd_ss'. Requires AVX512F.
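
A minimal plain-Go sketch of the negated-multiply-add semantics (the helper name is invented; Go performs an unfused multiply followed by an add):

	// fnmaddSsModel mirrors _mm_mask3_fnmadd_ss for lane 0: the
	// product is negated before the add; lanes 1-3 come from 'a'.
	func fnmaddSsModel(a, b, c [4]float32, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = -(a[0] * b[0]) + c[0]
		} else {
			dst[0] = c[0]
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3] // upper lanes from 'a'
		return
	}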

func Mask3FnmsubPd

func Mask3FnmsubPd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask3_fnmsub_pd'. Requires AVX512F.

func Mask3FnmsubPs

func Mask3FnmsubPs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'c' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask3_fnmsub_ps'. Requires AVX512F.

func Mask3FnmsubRoundSd

func Mask3FnmsubRoundSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8, rounding int) (dst x86.M128d)

Mask3FnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := c[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_round_sd'. Requires AVX512F.

func Mask3FnmsubRoundSs

func Mask3FnmsubRoundSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8, rounding int) (dst x86.M128)

Mask3FnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := c[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_round_ss'. Requires AVX512F.

func Mask3FnmsubSd

func Mask3FnmsubSd(a x86.M128d, b x86.M128d, c x86.M128d, k x86.Mmask8) (dst x86.M128d)

Mask3FnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask3_fnmsub_sd'. Requires AVX512F.

func Mask3FnmsubSs

func Mask3FnmsubSs(a x86.M128, b x86.M128, c x86.M128, k x86.Mmask8) (dst x86.M128)

Mask3FnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'c' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask3_fnmsub_ss'. Requires AVX512F.

func MaskAbsEpi32

func MaskAbsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm_mask_abs_epi32'. Requires AVX512F.

func MaskAbsEpi64

func MaskAbsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_mask_abs_epi64'. Requires AVX512F.

func MaskAddEpi32

func MaskAddEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm_mask_add_epi32'. Requires AVX512F.

func MaskAddEpi64

func MaskAddEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm_mask_add_epi64'. Requires AVX512F.
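
The per-lane writemask loop translates directly to plain Go. A sketch, illustrative only (the helper name is invented):

	// maskAddEpi32Model mirrors _mm_mask_add_epi32: lanes whose mask
	// bit is set receive a+b, the rest are copied from 'src'.
	func maskAddEpi32Model(src, a, b [4]int32, k uint8) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] + b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}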

func MaskAddRoundSd

func MaskAddRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] + b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_round_sd'. Requires AVX512F.

func MaskAddRoundSs

func MaskAddRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] + b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_round_ss'. Requires AVX512F.

func MaskAddSd

func MaskAddSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_mask_add_sd'. Requires AVX512F.

func MaskAddSs

func MaskAddSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_mask_add_ss'. Requires AVX512F.
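
A plain-Go model of the masked scalar add (hypothetical helper): only lane 0 is masked, while lanes 1-3 always come from 'a':

	// maskAddSsModel mirrors _mm_mask_add_ss.
	func maskAddSsModel(src, a, b [4]float32, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = a[0] + b[0]
		} else {
			dst[0] = src[0] // note: from 'src', not from 'c' as in the mask3 forms
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3]
		return
	}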

func MaskAndEpi32

func MaskAndEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm_mask_and_epi32'. Requires AVX512F.

func MaskAndEpi64

func MaskAndEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm_mask_and_epi64'. Requires AVX512F.

func MaskAndnotEpi32

func MaskAndnotEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm_mask_andnot_epi32'. Requires AVX512F.

func MaskAndnotEpi64

func MaskAndnotEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm_mask_andnot_epi64'. Requires AVX512F.
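
Note the operand order in the pseudocode: the result is (NOT 'a') AND 'b', not 'a' AND (NOT 'b'). A plain-Go sketch (illustrative only):

	// maskAndnotEpi32Model mirrors _mm_mask_andnot_epi32.
	func maskAndnotEpi32Model(src, a, b [4]uint32, k uint8) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = ^a[j] & b[j] // (NOT a) AND b
			} else {
				dst[j] = src[j]
			}
		}
		return
	}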

func MaskBlendEpi32

func MaskBlendEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMD'. Intrinsic: '_mm_mask_blend_epi32'. Requires AVX512F.

func MaskBlendEpi64

func MaskBlendEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi64: Blend packed 64-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMQ'. Intrinsic: '_mm_mask_blend_epi64'. Requires AVX512F.

func MaskBlendPd

func MaskBlendPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskBlendPd: Blend packed double-precision (64-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBLENDMPD'. Intrinsic: '_mm_mask_blend_pd'. Requires AVX512F.

func MaskBlendPs

func MaskBlendPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskBlendPs: Blend packed single-precision (32-bit) floating-point elements from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBLENDMPS'. Intrinsic: '_mm_mask_blend_ps'. Requires AVX512F.
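
Unlike the writemasked operations above, blend takes no 'src' operand; the mask simply selects between the two inputs per lane. A plain-Go sketch (the helper name is invented):

	// maskBlendPsModel mirrors _mm_mask_blend_ps.
	func maskBlendPsModel(a, b [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = b[j] // mask bit set: take 'b'
			} else {
				dst[j] = a[j] // mask bit clear: take 'a'
			}
		}
		return
	}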

func MaskBroadcastdEpi32

func MaskBroadcastdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_broadcastd_epi32'. Requires AVX512F.

func MaskBroadcastqEpi64

func MaskBroadcastqEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_broadcastq_epi64'. Requires AVX512F.

func MaskBroadcastssPs

func MaskBroadcastssPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_mask_broadcastss_ps'. Requires AVX512F.
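
A plain-Go model of the masked broadcast (illustrative only): lane 0 of 'a' is replicated into every selected lane:

	// maskBroadcastssPsModel mirrors _mm_mask_broadcastss_ps.
	func maskBroadcastssPsModel(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[0] // broadcast the low element
			} else {
				dst[j] = src[j]
			}
		}
		return
	}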

func MaskCmpEpi32Mask

func MaskCmpEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmp_epi32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
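
The _MM_CMPINT table maps naturally onto a switch. The sketch below is an illustrative plain-Go model of the masked compare (the helper name is invented), not a working intrinsic:

	// cmpEpi32MaskModel mirrors _mm_mask_cmp_epi32_mask for the eight
	// _MM_CMPINT predicates (imm8 values 0-7), returning a 4-bit mask.
	func cmpEpi32MaskModel(k1 uint8, a, b [4]int32, imm8 byte) (k uint8) {
		for j := 0; j < 4; j++ {
			if k1&(1<<uint(j)) == 0 {
				continue // zeromask: bit stays 0
			}
			var r bool
			switch imm8 & 7 {
			case 0: // _MM_CMPINT_EQ
				r = a[j] == b[j]
			case 1: // _MM_CMPINT_LT
				r = a[j] < b[j]
			case 2: // _MM_CMPINT_LE
				r = a[j] <= b[j]
			case 3: // _MM_CMPINT_FALSE
				r = false
			case 4: // _MM_CMPINT_NEQ
				r = a[j] != b[j]
			case 5: // _MM_CMPINT_NLT
				r = a[j] >= b[j]
			case 6: // _MM_CMPINT_NLE
				r = a[j] > b[j]
			case 7: // _MM_CMPINT_TRUE
				r = true
			}
			if r {
				k |= 1 << uint(j)
			}
		}
		return
	}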

func MaskCmpEpi64Mask

func MaskCmpEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmp_epi64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu32Mask

func MaskCmpEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmp_epu32_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu64Mask

func MaskCmpEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmp_epu64_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpPdMask

func MaskCmpPdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskCmpPdMask: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VCMPPD'. Intrinsic: '_mm_mask_cmp_pd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpPsMask

func MaskCmpPsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

MaskCmpPsMask: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VCMPPS'. Intrinsic: '_mm_mask_cmp_ps_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpRoundSdMask

func MaskCmpRoundSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, sae int) (dst x86.Mmask8)

MaskCmpRoundSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	IF k1[0]
		k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
	ELSE
		k[0] := 0
	FI
	k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_round_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpRoundSsMask

func MaskCmpRoundSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, sae int) (dst x86.Mmask8)

MaskCmpRoundSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	CASE (imm8[7:0]) OF
	0: OP := _CMP_EQ_OQ
	1: OP := _CMP_LT_OS
	2: OP := _CMP_LE_OS
	3: OP := _CMP_UNORD_Q
	4: OP := _CMP_NEQ_UQ
	5: OP := _CMP_NLT_US
	6: OP := _CMP_NLE_US
	7: OP := _CMP_ORD_Q
	8: OP := _CMP_EQ_UQ
	9: OP := _CMP_NGE_US
	10: OP := _CMP_NGT_US
	11: OP := _CMP_FALSE_OQ
	12: OP := _CMP_NEQ_OQ
	13: OP := _CMP_GE_OS
	14: OP := _CMP_GT_OS
	15: OP := _CMP_TRUE_UQ
	16: OP := _CMP_EQ_OS
	17: OP := _CMP_LT_OQ
	18: OP := _CMP_LE_OQ
	19: OP := _CMP_UNORD_S
	20: OP := _CMP_NEQ_US
	21: OP := _CMP_NLT_UQ
	22: OP := _CMP_NLE_UQ
	23: OP := _CMP_ORD_S
	24: OP := _CMP_EQ_US
	25: OP := _CMP_NGE_UQ
	26: OP := _CMP_NGT_UQ
	27: OP := _CMP_FALSE_OS
	28: OP := _CMP_NEQ_OS
	29: OP := _CMP_GE_OQ
	30: OP := _CMP_GT_OQ
	31: OP := _CMP_TRUE_US
	ESAC

	IF k1[0]
		k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
	ELSE
		k[0] := 0
	FI
	k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_round_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpSdMask

func MaskCmpSdMask(k1 x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskCmpSdMask: Compare the lower double-precision (64-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

IF k1[0]
	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
ELSE
	k[0] := 0
FI
k[MAX:1] := 0

Instruction: 'VCMPSD'. Intrinsic: '_mm_mask_cmp_sd_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpSsMask

func MaskCmpSsMask(k1 x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.Mmask8)

MaskCmpSsMask: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

IF k1[0]
	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
ELSE
	k[0] := 0
FI
k[MAX:1] := 0

Instruction: 'VCMPSS'. Intrinsic: '_mm_mask_cmp_ss_mask'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskCmpeqEpi32Mask

func MaskCmpeqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpeq_epi32_mask'. Requires AVX512F.

func MaskCmpeqEpi64Mask

func MaskCmpeqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpeq_epi64_mask'. Requires AVX512F.

func MaskCmpeqEpu32Mask

func MaskCmpeqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpeq_epu32_mask'. Requires AVX512F.

func MaskCmpeqEpu64Mask

func MaskCmpeqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpeq_epu64_mask'. Requires AVX512F.
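
A plain-Go model of the zeromasked equality compare (hypothetical helper); the fixed-predicate forms are shorthands for MaskCmpEpi32Mask and friends with the corresponding _MM_CMPINT operand:

	// cmpeqEpi32MaskModel mirrors _mm_mask_cmpeq_epi32_mask.
	func cmpeqEpi32MaskModel(k1 uint8, a, b [4]int32) (k uint8) {
		for j := 0; j < 4; j++ {
			if k1&(1<<uint(j)) != 0 && a[j] == b[j] {
				k |= 1 << uint(j)
			}
		}
		return
	}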

func MaskCmpgeEpi32Mask

func MaskCmpgeEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpge_epi32_mask'. Requires AVX512F.

func MaskCmpgeEpi64Mask

func MaskCmpgeEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpge_epi64_mask'. Requires AVX512F.

func MaskCmpgeEpu32Mask

func MaskCmpgeEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpge_epu32_mask'. Requires AVX512F.

func MaskCmpgeEpu64Mask

func MaskCmpgeEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpge_epu64_mask'. Requires AVX512F.

func MaskCmpgtEpi32Mask

func MaskCmpgtEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpgt_epi32_mask'. Requires AVX512F.

func MaskCmpgtEpi64Mask

func MaskCmpgtEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpgt_epi64_mask'. Requires AVX512F.

func MaskCmpgtEpu32Mask

func MaskCmpgtEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpgt_epu32_mask'. Requires AVX512F.

func MaskCmpgtEpu64Mask

func MaskCmpgtEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpgt_epu64_mask'. Requires AVX512F.

func MaskCmpleEpi32Mask

func MaskCmpleEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmple_epi32_mask'. Requires AVX512F.

func MaskCmpleEpi64Mask

func MaskCmpleEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmple_epi64_mask'. Requires AVX512F.

func MaskCmpleEpu32Mask

func MaskCmpleEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmple_epu32_mask'. Requires AVX512F.

func MaskCmpleEpu64Mask

func MaskCmpleEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmple_epu64_mask'. Requires AVX512F.

func MaskCmpltEpi32Mask

func MaskCmpltEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmplt_epi32_mask'. Requires AVX512F.

func MaskCmpltEpi64Mask

func MaskCmpltEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmplt_epi64_mask'. Requires AVX512F.

func MaskCmpltEpu32Mask

func MaskCmpltEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmplt_epu32_mask'. Requires AVX512F.

func MaskCmpltEpu64Mask

func MaskCmpltEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmplt_epu64_mask'. Requires AVX512F.

func MaskCmpneqEpi32Mask

func MaskCmpneqEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpi32Mask: Compare packed 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPD'. Intrinsic: '_mm_mask_cmpneq_epi32_mask'. Requires AVX512F.

func MaskCmpneqEpi64Mask

func MaskCmpneqEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpi64Mask: Compare packed 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPQ'. Intrinsic: '_mm_mask_cmpneq_epi64_mask'. Requires AVX512F.

func MaskCmpneqEpu32Mask

func MaskCmpneqEpu32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpu32Mask: Compare packed unsigned 32-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPCMPUD'. Intrinsic: '_mm_mask_cmpneq_epu32_mask'. Requires AVX512F.

func MaskCmpneqEpu64Mask

func MaskCmpneqEpu64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpu64Mask: Compare packed unsigned 64-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPCMPUQ'. Intrinsic: '_mm_mask_cmpneq_epu64_mask'. Requires AVX512F.

func MaskCompressEpi32

func MaskCompressEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_mask_compress_epi32'. Requires AVX512F.

func MaskCompressEpi64

func MaskCompressEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_mask_compress_epi64'. Requires AVX512F.

func MaskCompressPd

func MaskCompressPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_mask_compress_pd'. Requires AVX512F.

func MaskCompressPs

func MaskCompressPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in writemask 'k') to 'dst', and pass through the remaining elements from 'src'.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_mask_compress_ps'. Requires AVX512F.
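
Compression packs the selected lanes toward lane 0 in order and fills the remaining lanes from 'src'. A plain-Go sketch (illustrative only):

	// maskCompressPsModel mirrors _mm_mask_compress_ps.
	func maskCompressPsModel(src, a [4]float32, k uint8) (dst [4]float32) {
		m := 0
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[m] = a[j] // active lanes are stored contiguously
				m++
			}
		}
		for ; m < 4; m++ {
			dst[m] = src[m] // remaining lanes pass through from 'src'
		}
		return
	}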

func MaskCvtRoundpsPh

func MaskCvtRoundpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvt_roundps_ph'. Requires AVX512F.

func MaskCvtRoundsdSs

func MaskCvtRoundsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

MaskCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvt_roundsd_ss'. Requires AVX512F.
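
A plain-Go model of the masked narrowing conversion (the helper name is invented). Go's float64-to-float32 conversion rounds to nearest even, which corresponds to the default rounding direction rather than an explicit 'rounding' argument:

	// maskCvtsdSsModel mirrors _mm_mask_cvt_roundsd_ss under
	// round-to-nearest: b[0] is narrowed when mask bit 0 is set.
	func maskCvtsdSsModel(src, a [4]float32, b [2]float64, k uint8) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = float32(b[0]) // rounds to nearest even
		} else {
			dst[0] = src[0]
		}
		dst[1], dst[2], dst[3] = a[1], a[2], a[3] // upper lanes from 'a'
		return
	}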

func MaskCvtRoundssSd

func MaskCvtRoundssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

MaskCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvt_roundss_sd'. Requires AVX512F.

func MaskCvtepi16Epi32

func MaskCvtepi16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	l := j*16
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_mask_cvtepi16_epi32'. Requires AVX512F.

func MaskCvtepi16Epi64

func MaskCvtepi16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_mask_cvtepi16_epi64'. Requires AVX512F.

func MaskCvtepi32Epi16

func MaskCvtepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_mask_cvtepi32_epi16'. Requires AVX512F.

func MaskCvtepi32Epi64

func MaskCvtepi32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_mask_cvtepi32_epi64'. Requires AVX512F.
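
A plain-Go sketch of the masked sign extension (illustrative only); Go's int64(int32) conversion sign extends, matching VPMOVSXDQ:

	// maskCvtepi32Epi64Model mirrors _mm_mask_cvtepi32_epi64: the low
	// two 32-bit lanes of 'a' widen into the two 64-bit lanes.
	func maskCvtepi32Epi64Model(src [2]int64, a [4]int32, k uint8) (dst [2]int64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(a[j]) // sign-extending conversion
			} else {
				dst[j] = src[j]
			}
		}
		return
	}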

func MaskCvtepi32Epi8

func MaskCvtepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_mask_cvtepi32_epi8'. Requires AVX512F.

func MaskCvtepi32Pd

func MaskCvtepi32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := src[m+63:m]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_mask_cvtepi32_pd'. Requires AVX512F.

func MaskCvtepi32Ps

func MaskCvtepi32Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_mask_cvtepi32_ps'. Requires AVX512F.
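
The same mask-select pattern carries over to the int→float conversions. A hypothetical sketch:

	// maskCvtepi32PsRef sketches VCVTDQ2PS under a writemask.
	func maskCvtepi32PsRef(src [4]float32, k uint8, a [4]int32) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = float32(a[j]) // rounds to nearest even for magnitudes above 2^24
			} else {
				dst[j] = src[j]
			}
		}
		return
	}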

func MaskCvtepi64Epi16

func MaskCvtepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_mask_cvtepi64_epi16'. Requires AVX512F.

func MaskCvtepi64Epi32

func MaskCvtepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_mask_cvtepi64_epi32'. Requires AVX512F.

func MaskCvtepi64Epi8

func MaskCvtepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_mask_cvtepi64_epi8'. Requires AVX512F.

func MaskCvtepi8Epi32

func MaskCvtepi8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_mask_cvtepi8_epi32'. Requires AVX512F.

func MaskCvtepi8Epi64

func MaskCvtepi8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_mask_cvtepi8_epi64'. Requires AVX512F.

func MaskCvtepu16Epi32

func MaskCvtepu16Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_mask_cvtepu16_epi32'. Requires AVX512F.

func MaskCvtepu16Epi64

func MaskCvtepu16Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_mask_cvtepu16_epi64'. Requires AVX512F.

func MaskCvtepu32Epi64

func MaskCvtepu32Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_mask_cvtepu32_epi64'. Requires AVX512F.

func MaskCvtepu32Pd

func MaskCvtepu32Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_mask_cvtepu32_pd'. Requires AVX512F.

func MaskCvtepu8Epi32

func MaskCvtepu8Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_mask_cvtepu8_epi32'. Requires AVX512F.

func MaskCvtepu8Epi64

func MaskCvtepu8Epi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_mask_cvtepu8_epi64'. Requires AVX512F.

func MaskCvtpdEpi32

func MaskCvtpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_mask_cvtpd_epi32'. Requires AVX512F.
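
A hypothetical sketch of the masked conversion (the package and function names are illustrative; the real instruction honours MXCSR.RC, which plain Go cannot change, so this assumes the default round-to-nearest-even mode):

	package avx512ref

	import "math"

	// maskCvtpdEpi32Ref sketches VCVTPD2DQ under a writemask, assuming
	// round-to-nearest-even and ignoring out-of-range inputs.
	func maskCvtpdEpi32Ref(src [2]int32, k uint8, a [2]float64) (dst [2]int32) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int32(math.RoundToEven(a[j]))
			} else {
				dst[j] = src[j]
			}
		}
		return
	}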

func MaskCvtpdEpu32

func MaskCvtpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_mask_cvtpd_epu32'. Requires AVX512F.

func MaskCvtpdPs

func MaskCvtpdPs(src x86.M128, k x86.Mmask8, a x86.M128d) (dst x86.M128)

MaskCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_mask_cvtpd_ps'. Requires AVX512F.

func MaskCvtphPs

func MaskCvtphPs(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_mask_cvtph_ps'. Requires AVX512F.

func MaskCvtpsEpi32

func MaskCvtpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_mask_cvtps_epi32'. Requires AVX512F.

func MaskCvtpsEpu32

func MaskCvtpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_mask_cvtps_epu32'. Requires AVX512F.

func MaskCvtpsPh

func MaskCvtpsPh(src x86.M128i, k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := src[i+15:i]
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_mask_cvtps_ph'. Requires AVX512F.

func MaskCvtsdSs

func MaskCvtsdSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)

MaskCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_mask_cvtsd_ss'. Requires AVX512F.

func MaskCvtsepi32Epi16

func MaskCvtsepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_mask_cvtsepi32_epi16'. Requires AVX512F.

func MaskCvtsepi32Epi8

func MaskCvtsepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_mask_cvtsepi32_epi8'. Requires AVX512F.
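
Signed saturation clamps to the target type's range instead of discarding high bits. A hypothetical sketch:

	// maskCvtsepi32Epi8Ref sketches VPMOVSDB: each selected 32-bit lane is
	// clamped to [-128, 127] before narrowing to int8.
	func maskCvtsepi32Epi8Ref(src [4]int8, k uint8, a [4]int32) (dst [4]int8) {
		sat := func(v int32) int8 {
			if v > 127 {
				return 127
			}
			if v < -128 {
				return -128
			}
			return int8(v)
		}
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = sat(a[j])
			} else {
				dst[j] = src[j]
			}
		}
		return
	}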

func MaskCvtsepi64Epi16

func MaskCvtsepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_mask_cvtsepi64_epi16'. Requires AVX512F.

func MaskCvtsepi64Epi32

func MaskCvtsepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_mask_cvtsepi64_epi32'. Requires AVX512F.

func MaskCvtsepi64Epi8

func MaskCvtsepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_mask_cvtsepi64_epi8'. Requires AVX512F.

func MaskCvtssSd

func MaskCvtssSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)

MaskCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_mask_cvtss_sd'. Requires AVX512F.

func MaskCvttpdEpi32

func MaskCvttpdEpi32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_mask_cvttpd_epi32'. Requires AVX512F.

func MaskCvttpdEpu32

func MaskCvttpdEpu32(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_mask_cvttpd_epu32'. Requires AVX512F.

func MaskCvttpsEpi32

func MaskCvttpsEpi32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_mask_cvttps_epi32'. Requires AVX512F.

func MaskCvttpsEpu32

func MaskCvttpsEpu32(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_mask_cvttps_epu32'. Requires AVX512F.

func MaskCvtusepi32Epi16

func MaskCvtusepi32Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_mask_cvtusepi32_epi16'. Requires AVX512F.
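
Unlike the truncating VPMOVDW earlier, the unsigned saturating form clamps before narrowing. A hypothetical sketch:

	// maskCvtusepi32Epi16Ref sketches VPMOVUSDW: each selected unsigned
	// 32-bit lane is clamped to 0xFFFF before narrowing.
	func maskCvtusepi32Epi16Ref(src [4]uint16, k uint8, a [4]uint32) (dst [4]uint16) {
		for j := 0; j < 4; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = src[j]
			case a[j] > 0xFFFF:
				dst[j] = 0xFFFF // unsigned saturation
			default:
				dst[j] = uint16(a[j])
			}
		}
		return
	}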

func MaskCvtusepi32Epi8

func MaskCvtusepi32Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_mask_cvtusepi32_epi8'. Requires AVX512F.

func MaskCvtusepi64Epi16

func MaskCvtusepi64Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_mask_cvtusepi64_epi16'. Requires AVX512F.

func MaskCvtusepi64Epi32

func MaskCvtusepi64Epi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_mask_cvtusepi64_epi32'. Requires AVX512F.

func MaskCvtusepi64Epi8

func MaskCvtusepi64Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_mask_cvtusepi64_epi8'. Requires AVX512F.

func MaskDivPd

func MaskDivPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm_mask_div_pd'. Requires AVX512F.

func MaskDivPs

func MaskDivPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm_mask_div_ps'. Requires AVX512F.
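
A hypothetical sketch of the masked division; as with IEEE 754 hardware, Go's float division by zero yields ±Inf rather than trapping:

	// maskDivPsRef sketches VDIVPS under a writemask: only selected lanes
	// are divided; the rest pass through from 'src'.
	func maskDivPsRef(src [4]float32, k uint8, a, b [4]float32) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j] / b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}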

func MaskDivRoundSd

func MaskDivRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] / b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_round_sd'. Requires AVX512F.

func MaskDivRoundSs

func MaskDivRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] / b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_round_ss'. Requires AVX512F.

func MaskDivSd

func MaskDivSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] / b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_mask_div_sd'. Requires AVX512F.

func MaskDivSs

func MaskDivSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] / b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_mask_div_ss'. Requires AVX512F.

func MaskExpandEpi32

func MaskExpandEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm_mask_expand_epi32'. Requires AVX512F.
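
Note the subtlety that distinguishes expand from a plain blend: the source index advances only on set mask bits, so 'a' is consumed contiguously. A hypothetical sketch:

	// maskExpandEpi32Ref sketches VPEXPANDD: set lanes receive consecutive
	// elements of 'a' while clear lanes keep their 'src' values.
	func maskExpandEpi32Ref(src [4]int32, k uint8, a [4]int32) (dst [4]int32) {
		m := 0
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[m] // next unconsumed element of 'a'
				m++
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

With k = 0b1010, for example, dst becomes [src[0], a[0], src[2], a[1]].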

func MaskExpandEpi64

func MaskExpandEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_mask_expand_epi64'. Requires AVX512F.

func MaskExpandPd

func MaskExpandPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm_mask_expand_pd'. Requires AVX512F.

func MaskExpandPs

func MaskExpandPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm_mask_expand_ps'. Requires AVX512F.

func MaskFixupimmPd

func MaskFixupimmPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_mask_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
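
The core of the pseudocode is a 4-bit table lookup: the classified token type selects one nibble of the control word in 'c', which then picks the fix-up action. A hypothetical sketch of just that selection step:

	// tokenResponse extracts the 4-bit action code for token class j (0..7),
	// mirroring token_response[3:0] := src3[3+4*j:4*j] in the pseudocode.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}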

func MaskFixupimmPs

func MaskFixupimmPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPS(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_mask_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmRoundSd

func MaskFixupimmRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

MaskFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		IF k[0]
			dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmRoundSs

func MaskFixupimmRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

MaskFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		IF k[0]
			dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmSd

func MaskFixupimmSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

IF k[0]
	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_mask_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFixupimmSs

func MaskFixupimmSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPS(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

IF k[0]
	dst[31:0] := FIXUPIMMPS(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_mask_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskFmaddPd

func MaskFmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_mask_fmadd_pd'. Requires AVX512F.
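
Since Go 1.14 the standard library's math.FMA computes x*y+z with a single rounding, matching the fused multiply-add these instructions perform. A hypothetical masked sketch:

	package avx512ref

	import "math"

	// maskFmaddPdRef sketches VFMADD*PD under a writemask: selected lanes get
	// the fused a*b+c; masked-out lanes keep their value from 'a'.
	func maskFmaddPdRef(a [2]float64, k uint8, b, c [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.FMA(a[j], b[j], c[j])
			} else {
				dst[j] = a[j]
			}
		}
		return
	}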

func MaskFmaddPs

func MaskFmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_mask_fmadd_ps'. Requires AVX512F.

func MaskFmaddRoundSd

func MaskFmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_round_sd'. Requires AVX512F.

func MaskFmaddRoundSs

func MaskFmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_round_ss'. Requires AVX512F.

func MaskFmaddSd

func MaskFmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_mask_fmadd_sd'. Requires AVX512F.

func MaskFmaddSs

func MaskFmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_mask_fmadd_ss'. Requires AVX512F.

func MaskFmaddsubPd

func MaskFmaddsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_mask_fmaddsub_pd'. Requires AVX512F.
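
The even/odd lane split is the only difference from plain FMA; negating 'c' on even lanes turns the fused add into the required subtract. A hypothetical sketch:

	package avx512ref

	import "math"

	// maskFmaddsubPdRef sketches VFMADDSUB*PD: even lanes compute a*b-c,
	// odd lanes a*b+c, each under the writemask (clear bits keep 'a').
	func maskFmaddsubPdRef(a [2]float64, k uint8, b, c [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			switch {
			case k&(1<<uint(j)) == 0:
				dst[j] = a[j]
			case j%2 == 0:
				dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: a*b - c
			default:
				dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: a*b + c
			}
		}
		return
	}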

func MaskFmaddsubPs

func MaskFmaddsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_mask_fmaddsub_ps'. Requires AVX512F.

func MaskFmsubPd

func MaskFmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_mask_fmsub_pd'. Requires AVX512F.

func MaskFmsubPs

func MaskFmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_mask_fmsub_ps'. Requires AVX512F.

func MaskFmsubRoundSd

func MaskFmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_round_sd'. Requires AVX512F.

func MaskFmsubRoundSs

func MaskFmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_round_ss'. Requires AVX512F.

func MaskFmsubSd

func MaskFmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_mask_fmsub_sd'. Requires AVX512F.

func MaskFmsubSs

func MaskFmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_mask_fmsub_ss'. Requires AVX512F.

func MaskFmsubaddPd

func MaskFmsubaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_mask_fmsubadd_pd'. Requires AVX512F.

func MaskFmsubaddPs

func MaskFmsubaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternately subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_mask_fmsubadd_ps'. Requires AVX512F.

func MaskFnmaddPd

func MaskFnmaddPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_mask_fnmadd_pd'. Requires AVX512F.

func MaskFnmaddPs

func MaskFnmaddPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_mask_fnmadd_ps'. Requires AVX512F.
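
For reference, a pure-Go model of the fnmadd writemask semantics; the helper name and [4]float32 representation are illustrative stand-ins, not part of this package.

	package main

	import "fmt"

	// maskFnmaddPs models _mm_mask_fnmadd_ps: each selected lane computes
	// -(a*b) + c; unselected lanes keep the value from 'a'.
	func maskFnmaddPs(a, b, c [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = -(a[j] * b[j]) + c[j]
			} else {
				dst[j] = a[j]
			}
		}
		return
	}

	func main() {
		a := [4]float32{1, 2, 3, 4}
		b := [4]float32{5, 5, 5, 5}
		c := [4]float32{100, 100, 100, 100}
		fmt.Println(maskFnmaddPs(a, b, c, 0b0101)) // [95 2 85 4]
	}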

func MaskFnmaddRoundSd

func MaskFnmaddRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_round_sd'. Requires AVX512F.

func MaskFnmaddRoundSs

func MaskFnmaddRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_round_ss'. Requires AVX512F.

func MaskFnmaddSd

func MaskFnmaddSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_mask_fnmadd_sd'. Requires AVX512F.

func MaskFnmaddSs

func MaskFnmaddSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_mask_fnmadd_ss'. Requires AVX512F.

func MaskFnmsubPd

func MaskFnmsubPd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_mask_fnmsub_pd'. Requires AVX512F.

func MaskFnmsubPs

func MaskFnmsubPs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_mask_fnmsub_ps'. Requires AVX512F.

func MaskFnmsubRoundSd

func MaskFnmsubRoundSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := a[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_round_sd'. Requires AVX512F.

func MaskFnmsubRoundSs

func MaskFnmsubRoundSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := a[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_round_ss'. Requires AVX512F.

func MaskFnmsubSd

func MaskFnmsubSd(a x86.M128d, k x86.Mmask8, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_mask_fnmsub_sd'. Requires AVX512F.

func MaskFnmsubSs

func MaskFnmsubSs(a x86.M128, k x86.Mmask8, b x86.M128, c x86.M128) (dst x86.M128)

MaskFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'a' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_mask_fnmsub_ss'. Requires AVX512F.
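
A scalar sketch of the fnmsub single-precision semantics, including the pass-through of the upper three lanes; plain Go stand-ins, not package API.

	package main

	import "fmt"

	// maskFnmsubSs models _mm_mask_fnmsub_ss: lane 0 computes -(a*b) - c when
	// mask bit 0 is set (otherwise it keeps a[0]); lanes 1..3 always come
	// from 'a'.
	func maskFnmsubSs(a, b, c [4]float32, k uint8) (dst [4]float32) {
		dst = a // upper 3 lanes are copied from 'a'
		if k&1 != 0 {
			dst[0] = -(a[0] * b[0]) - c[0]
		}
		return
	}

	func main() {
		a := [4]float32{2, 9, 9, 9}
		b := [4]float32{3, 0, 0, 0}
		c := [4]float32{1, 0, 0, 0}
		fmt.Println(maskFnmsubSs(a, b, c, 1)) // [-7 9 9 9]
		fmt.Println(maskFnmsubSs(a, b, c, 0)) // [2 9 9 9]
	}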

func MaskGetexpPd

func MaskGetexpPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_mask_getexp_pd'. Requires AVX512F.

func MaskGetexpPs

func MaskGetexpPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_mask_getexp_ps'. Requires AVX512F.
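
Since getexp computes floor(log2(|x|)) per element, the operation can be modeled for normal, nonzero inputs with math.Frexp; special values (zero, infinity, NaN) follow different rules on real hardware and are not handled in this sketch.

	package main

	import (
		"fmt"
		"math"
	)

	// maskGetexpPs models _mm_mask_getexp_ps for normal, nonzero inputs:
	// each selected lane becomes floor(log2(|x|)) as a float32.
	func maskGetexpPs(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				// Frexp returns f, e with |f| in [0.5, 1) and x = f * 2^e,
				// so floor(log2(|x|)) is e-1.
				_, e := math.Frexp(float64(a[j]))
				dst[j] = float32(e - 1)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{9, 9, 9, 9}
		a := [4]float32{8, 1.5, 0.25, 100}
		fmt.Println(maskGetexpPs(src, a, 0b1011)) // [3 0 9 6]
	}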

func MaskGetexpRoundSd

func MaskGetexpRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := ConvertExpFP64(b[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_round_sd'. Requires AVX512F.

func MaskGetexpRoundSs

func MaskGetexpRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := ConvertExpFP32(b[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_round_ss'. Requires AVX512F.

func MaskGetexpSd

func MaskGetexpSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_mask_getexp_sd'. Requires AVX512F.

func MaskGetexpSs

func MaskGetexpSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_mask_getexp_ss'. Requires AVX512F.

func MaskGetmantPd

func MaskGetmantPd(src x86.M128d, k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_mask_getmant_pd'. Requires AVX512F.

func MaskGetmantPs

func MaskGetmantPs(src x86.M128, k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_mask_getmant_ps'. Requires AVX512F.
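
For the common combination interv = _MM_MANT_NORM_1_2 and sc = _MM_MANT_SIGN_src, the normalization can be modeled with math.Frexp, which already splits a float into a mantissa in [0.5, 1) and an exponent; doubling that mantissa moves it into [1, 2). A hedged sketch on plain Go values, ignoring special inputs:

	package main

	import (
		"fmt"
		"math"
	)

	// maskGetmantPs models _mm_mask_getmant_ps for interv = _MM_MANT_NORM_1_2
	// and sc = _MM_MANT_SIGN_src on normal inputs: each selected lane is
	// rescaled so its magnitude lies in [1, 2), keeping the input's sign.
	func maskGetmantPs(src, a [4]float32, k uint8) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				f, _ := math.Frexp(float64(a[j])) // |f| in [0.5, 1)
				dst[j] = float32(2 * f)           // magnitude now in [1, 2)
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{0, 0, 0, 0}
		a := [4]float32{12, -12, 0.375, 1}
		fmt.Println(maskGetmantPs(src, a, 0b0111)) // [1.5 -1.5 1.5 0]
	}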

func MaskGetmantRoundSd

func MaskGetmantRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

MaskGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_round_sd'. Requires AVX512F.

func MaskGetmantRoundSs

func MaskGetmantRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

MaskGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_round_ss'. Requires AVX512F.

func MaskGetmantSd

func MaskGetmantSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_mask_getmant_sd'. Requires AVX512F.

func MaskGetmantSs

func MaskGetmantSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_mask_getmant_ss'. Requires AVX512F.

func MaskLoadSd

func MaskLoadSd(src x86.M128d, k x86.Mmask8, mem_addr *float64) (dst x86.M128d)

MaskLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
	dst[63:0] := src[63:0]
FI
dst[MAX:64] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_load_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskLoadSs

func MaskLoadSs(src x86.M128, k x86.Mmask8, mem_addr *float32) (dst x86.M128)

MaskLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
	dst[31:0] := src[31:0]
FI
dst[MAX:32] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_load_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
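
A pure-Go model of the masked scalar load, including the zeroing of the upper lanes; the alignment requirement of the real instruction is not modeled, and the helper is illustrative only.

	package main

	import "fmt"

	// maskLoadSs models _mm_mask_load_ss: lane 0 is loaded from memory when
	// mask bit 0 is set and taken from 'src' otherwise; lanes 1..3 are always
	// zeroed, matching dst[MAX:32] := 0.
	func maskLoadSs(src [4]float32, k uint8, memAddr *float32) (dst [4]float32) {
		if k&1 != 0 {
			dst[0] = *memAddr
		} else {
			dst[0] = src[0]
		}
		return
	}

	func main() {
		v := float32(3.5)
		src := [4]float32{7, 7, 7, 7}
		fmt.Println(maskLoadSs(src, 1, &v)) // [3.5 0 0 0]
		fmt.Println(maskLoadSs(src, 0, &v)) // [7 0 0 0]
	}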

func MaskMaxEpi32

func MaskMaxEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm_mask_max_epi32'. Requires AVX512F.
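
The writemask pattern shared by the min/max intrinsics in this group, modeled on plain [4]int32 values; an illustrative helper, not package API.

	package main

	import "fmt"

	// maskMaxEpi32 models _mm_mask_max_epi32: selected lanes take the signed
	// maximum of 'a' and 'b', unselected lanes come from 'src'.
	func maskMaxEpi32(src [4]int32, k uint8, a, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			switch {
			case k&(1<<j) == 0:
				dst[j] = src[j]
			case a[j] > b[j]:
				dst[j] = a[j]
			default:
				dst[j] = b[j]
			}
		}
		return
	}

	func main() {
		src := [4]int32{0, 0, 0, 0}
		a := [4]int32{-5, 10, 3, -1}
		b := [4]int32{4, 2, 3, -8}
		fmt.Println(maskMaxEpi32(src, 0b1110, a, b)) // [0 10 3 -1]
	}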

func MaskMaxEpi64

func MaskMaxEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_mask_max_epi64'. Requires AVX512F.

func MaskMaxEpu32

func MaskMaxEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm_mask_max_epu32'. Requires AVX512F.

func MaskMaxEpu64

func MaskMaxEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_mask_max_epu64'. Requires AVX512F.

func MaskMaxPd

func MaskMaxPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm_mask_max_pd'. Requires AVX512F.

func MaskMaxPs

func MaskMaxPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm_mask_max_ps'. Requires AVX512F.

func MaskMaxRoundSd

func MaskMaxRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MAX(a[63:0], b[63:0])
	ELSE
		dst[63:0] := src[63:0]
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_round_sd'. Requires AVX512F.

func MaskMaxRoundSs

func MaskMaxRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MAX(a[31:0], b[31:0])
	ELSE
		dst[31:0] := src[31:0]
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_round_ss'. Requires AVX512F.

func MaskMaxSd

func MaskMaxSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_mask_max_sd'. Requires AVX512F.

func MaskMaxSs

func MaskMaxSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_mask_max_ss'. Requires AVX512F.

func MaskMinEpi32

func MaskMinEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm_mask_min_epi32'. Requires AVX512F.

func MaskMinEpi64

func MaskMinEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_mask_min_epi64'. Requires AVX512F.

func MaskMinEpu32

func MaskMinEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm_mask_min_epu32'. Requires AVX512F.

func MaskMinEpu64

func MaskMinEpu64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_mask_min_epu64'. Requires AVX512F.

func MaskMinPd

func MaskMinPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm_mask_min_pd'. Requires AVX512F.

func MaskMinPs

func MaskMinPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm_mask_min_ps'. Requires AVX512F.

func MaskMinRoundSd

func MaskMinRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MIN(a[63:0], b[63:0])
	ELSE
		dst[63:0] := src[63:0]
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_round_sd'. Requires AVX512F.

func MaskMinRoundSs

func MaskMinRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MIN(a[31:0], b[31:0])
	ELSE
		dst[31:0] := src[31:0]
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_round_ss'. Requires AVX512F.

func MaskMinSd

func MaskMinSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_mask_min_sd'. Requires AVX512F.

func MaskMinSs

func MaskMinSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_mask_min_ss'. Requires AVX512F.

func MaskMovEpi32

func MaskMovEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskMovEpi32: Move packed 32-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm_mask_mov_epi32'. Requires AVX512F.
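
Masked moves are per-lane blends between 'a' and 'src'. A minimal sketch on plain Go values:

	package main

	import "fmt"

	// maskMovEpi32 models _mm_mask_mov_epi32: lanes with the mask bit set
	// come from 'a', the rest from 'src'.
	func maskMovEpi32(src [4]int32, k uint8, a [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]int32{0, 0, 0, 0}
		a := [4]int32{1, 2, 3, 4}
		fmt.Println(maskMovEpi32(src, 0b0110, a)) // [0 2 3 0]
	}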

func MaskMovEpi64

func MaskMovEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskMovEpi64: Move packed 64-bit integers from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm_mask_mov_epi64'. Requires AVX512F.

func MaskMovPd

func MaskMovPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm_mask_mov_pd'. Requires AVX512F.

func MaskMovPs

func MaskMovPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm_mask_mov_ps'. Requires AVX512F.

func MaskMoveSd

func MaskMoveSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_move_sd'. Requires AVX512F.

func MaskMoveSs

func MaskMoveSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_move_ss'. Requires AVX512F.

func MaskMovedupPd

func MaskMovedupPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm_mask_movedup_pd'. Requires AVX512F.

func MaskMovehdupPs

func MaskMovehdupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_mask_movehdup_ps'. Requires AVX512F.

func MaskMoveldupPs

func MaskMoveldupPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_mask_moveldup_ps'. Requires AVX512F.
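
A sketch of the duplicate-then-blend behavior shared by the movedup/movehdup/moveldup entries above, here for the even-lane (moveldup) case; the helper is illustrative, not package API.

	package main

	import "fmt"

	// maskMoveldupPs models _mm_mask_moveldup_ps: even-indexed lanes of 'a'
	// are duplicated into the adjacent odd lanes, then the writemask blends
	// the result against 'src'.
	func maskMoveldupPs(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
		tmp := [4]float32{a[0], a[0], a[2], a[2]}
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{9, 9, 9, 9}
		a := [4]float32{1, 2, 3, 4}
		fmt.Println(maskMoveldupPs(src, 0b1110, a)) // [9 1 3 3]
	}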

func MaskMulEpi32

func MaskMulEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulEpi32: Multiply the low signed 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm_mask_mul_epi32'. Requires AVX512F.

func MaskMulEpu32

func MaskMulEpu32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm_mask_mul_epu32'. Requires AVX512F.

func MaskMulPd

func MaskMulPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm_mask_mul_pd'. Requires AVX512F.

func MaskMulPs

func MaskMulPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm_mask_mul_ps'. Requires AVX512F.

func MaskMulRoundSd

func MaskMulRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] * b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_round_sd'. Requires AVX512F.

func MaskMulRoundSs

func MaskMulRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] * b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_round_ss'. Requires AVX512F.

func MaskMulSd

func MaskMulSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] * b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mask_mul_sd'. Requires AVX512F.

func MaskMulSs

func MaskMulSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] * b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mask_mul_ss'. Requires AVX512F.

func MaskMulloEpi32

func MaskMulloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm_mask_mullo_epi32'. Requires AVX512F.
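
The contrast with MaskMulEpi32 above is in what is kept of the 64-bit product: mul_epi32 stores the full widened product per 64-bit lane, while mullo_epi32 keeps only the low 32 bits. A single-lane sketch (illustrative helpers, not package API):

	package main

	import "fmt"

	// mulEpi32Lane models one lane pair of _mm_mask_mul_epi32: the low
	// 32-bit integers are sign-extended and multiplied to a full 64-bit
	// product. mulloEpi32Lane models _mm_mask_mullo_epi32, which keeps only
	// the low 32 bits of the same product.
	func mulEpi32Lane(a, b int32) int64   { return int64(a) * int64(b) }
	func mulloEpi32Lane(a, b int32) int32 { return int32(int64(a) * int64(b)) }

	func main() {
		a, b := int32(100000), int32(100000)
		fmt.Println(mulEpi32Lane(a, b))   // 10000000000 (full 64-bit product)
		fmt.Println(mulloEpi32Lane(a, b)) // 1410065408 (low 32 bits only)
	}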

func MaskOrEpi32

func MaskOrEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORD'. Intrinsic: '_mm_mask_or_epi32'. Requires AVX512F.

func MaskOrEpi64

func MaskOrEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm_mask_or_epi64'. Requires AVX512F.

func MaskPermutePd

func MaskPermutePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskPermutePs

func MaskPermutePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
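
A pure-Go model of the SELECT4 pseudocode above: each destination lane picks one of a's four lanes via a 2-bit field of 'imm8', then the writemask blends against 'src'. The helper is illustrative, not package API.

	package main

	import "fmt"

	// maskPermutePs models _mm_mask_permute_ps: lane j of the temporary
	// result is a[imm8[2j+1:2j]], and the writemask then selects between
	// that and src[j].
	func maskPermutePs(src [4]float32, k uint8, a [4]float32, imm8 byte) (dst [4]float32) {
		for j := 0; j < 4; j++ {
			sel := (imm8 >> (2 * j)) & 3 // 2-bit selector for lane j
			if k&(1<<j) != 0 {
				dst[j] = a[sel]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}

	func main() {
		src := [4]float32{0, 0, 0, 0}
		a := [4]float32{10, 20, 30, 40}
		// imm8 = 0b00_01_10_11 selects lanes 3,2,1,0: a full reversal.
		fmt.Println(maskPermutePs(src, 0b1111, a, 0b00011011)) // [40 30 20 10]
	}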

func MaskPermutevarPd

func MaskPermutevarPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)

MaskPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_mask_permutevar_pd'. Requires AVX512F.

func MaskPermutevarPs

func MaskPermutevarPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)

MaskPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_mask_permutevar_ps'. Requires AVX512F.

func MaskPermutex2varEpi32

func MaskPermutex2varEpi32(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2D'. Intrinsic: '_mm_mask_permutex2var_epi32'. Requires AVX512F.

func MaskPermutex2varEpi64

func MaskPermutex2varEpi64(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2Q'. Intrinsic: '_mm_mask_permutex2var_epi64'. Requires AVX512F.

func MaskPermutex2varPd

func MaskPermutex2varPd(a x86.M128d, k x86.Mmask8, idx x86.M128i, b x86.M128d) (dst x86.M128d)

MaskPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2PD'. Intrinsic: '_mm_mask_permutex2var_pd'. Requires AVX512F.

func MaskPermutex2varPs

func MaskPermutex2varPs(a x86.M128, k x86.Mmask8, idx x86.M128i, b x86.M128) (dst x86.M128)

MaskPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2PS'. Intrinsic: '_mm_mask_permutex2var_ps'. Requires AVX512F.

func MaskRcp14Pd

func MaskRcp14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_mask_rcp14_pd'. Requires AVX512F.

func MaskRcp14Ps

func MaskRcp14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_mask_rcp14_ps'. Requires AVX512F.
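
A minimal pure-Go model of the masked reciprocal, using an exact division where the hardware produces an approximation with relative error below 2^-14 (maskRcp14Ps is hypothetical):

func maskRcp14Ps(src [4]float32, k uint8, a [4]float32) (dst [4]float32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = 1.0 / a[j] // stand-in for APPROXIMATE(1.0/a): exact instead of approximate
		} else {
			dst[j] = src[j]
		}
	}
	return
}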

func MaskRcp14Sd

func MaskRcp14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_mask_rcp14_sd'. Requires AVX512F.

func MaskRcp14Ss

func MaskRcp14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_mask_rcp14_ss'. Requires AVX512F.

func MaskRolEpi32

func MaskRolEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_mask_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRolEpi64

func MaskRolEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_mask_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRolvEpi32

func MaskRolvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_mask_rolv_epi32'. Requires AVX512F.
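
The variable left-rotate maps directly onto math/bits; a sketch under the same masking convention (maskRolvEpi32 is hypothetical):

import "math/bits"

func maskRolvEpi32(src [4]uint32, k uint8, a, b [4]uint32) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = bits.RotateLeft32(a[j], int(b[j]&31)) // rotate count is taken modulo 32
		} else {
			dst[j] = src[j]
		}
	}
	return
}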

func MaskRolvEpi64

func MaskRolvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_mask_rolv_epi64'. Requires AVX512F.

func MaskRorEpi32

func MaskRorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_mask_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRorEpi64

func MaskRorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_mask_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRorvEpi32

func MaskRorvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_mask_rorv_epi32'. Requires AVX512F.

func MaskRorvEpi64

func MaskRorvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_mask_rorv_epi64'. Requires AVX512F.

func MaskRoundscalePd

func MaskRoundscalePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_mask_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscalePs

func MaskRoundscalePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_mask_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
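
The round-and-scale operation can be modeled in pure Go; a sketch that honors imm8[1:0] and imm8[7:4] but not the MXCSR path selected by imm8[2] (maskRoundscalePs is hypothetical):

import "math"

func maskRoundscalePs(src [4]float32, k uint8, a [4]float32, imm8 byte) (dst [4]float32) {
	scale := math.Pow(2, float64(imm8>>4)) // 2^M, M = imm8[7:4] fraction bits to keep
	round := [4]func(float64) float64{
		math.RoundToEven, // 0: round to nearest even
		math.Floor,       // 1: round toward -Inf
		math.Ceil,        // 2: round toward +Inf
		math.Trunc,       // 3: round toward zero
	}[imm8&3]
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = float32(round(float64(a[j])*scale) / scale) // round at 2^M, scale back down
		} else {
			dst[j] = src[j]
		}
	}
	return
}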

func MaskRoundscaleRoundSd

func MaskRoundscaleRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleRoundSs

func MaskRoundscaleRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleSd

func MaskRoundscaleSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_mask_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRoundscaleSs

func MaskRoundscaleSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_mask_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskRsqrt14Pd

func MaskRsqrt14Pd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_mask_rsqrt14_pd'. Requires AVX512F.

func MaskRsqrt14Ps

func MaskRsqrt14Ps(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_mask_rsqrt14_ps'. Requires AVX512F.

func MaskRsqrt14Sd

func MaskRsqrt14Sd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_mask_rsqrt14_sd'. Requires AVX512F.

func MaskRsqrt14Ss

func MaskRsqrt14Ss(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_mask_rsqrt14_ss'. Requires AVX512F.

func MaskScalefPd

func MaskScalefPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_mask_scalef_pd'. Requires AVX512F.

func MaskScalefPs

func MaskScalefPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_mask_scalef_ps'. Requires AVX512F.
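
Ignoring the NaN/Inf/denormal special cases in the pseudocode, the scale step is a * 2^FLOOR(b), which math.Ldexp expresses directly (maskScalefPs is a hypothetical sketch):

import "math"

func maskScalefPs(src [4]float32, k uint8, a, b [4]float32) (dst [4]float32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			// Ldexp(x, e) computes x * 2^e; the special-case handling from SCALE() is omitted.
			dst[j] = float32(math.Ldexp(float64(a[j]), int(math.Floor(float64(b[j])))))
		} else {
			dst[j] = src[j]
		}
	}
	return
}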

func MaskScalefRoundSd

func MaskScalefRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskScalefRoundSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := SCALE(a[63:0], b[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_round_sd'. Requires AVX512F.

func MaskScalefRoundSs

func MaskScalefRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskScalefRoundSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := SCALE(a[31:0], b[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_round_ss'. Requires AVX512F.

func MaskScalefSd

func MaskScalefSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskScalefSd: Scale the lower double-precision (64-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_mask_scalef_sd'. Requires AVX512F.

func MaskScalefSs

func MaskScalefSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskScalefSs: Scale the lower single-precision (32-bit) floating-point element in 'a' using the lower element of 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_mask_scalef_ss'. Requires AVX512F.

func MaskSet1Epi32

func MaskSet1Epi32(src x86.M128i, k x86.Mmask8, a int) (dst x86.M128i)

MaskSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_mask_set1_epi32'. Requires AVX512F.

func MaskSet1Epi64

func MaskSet1Epi64(src x86.M128i, k x86.Mmask8, a int64) (dst x86.M128i)

MaskSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_mask_set1_epi64'. Requires AVX512F.

func MaskShuffleEpi32

func MaskShuffleEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm_mask_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
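
Each destination element takes two control bits from imm8; a pure-Go sketch of the masked shuffle (maskShuffleEpi32 is hypothetical):

func maskShuffleEpi32(src [4]uint32, k uint8, a [4]uint32, imm8 byte) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[(imm8>>(2*uint(j)))&3] // SELECT4 using imm8 bits 2j+1:2j
		} else {
			dst[j] = src[j]
		}
	}
	return
}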

func MaskShufflePd

func MaskShufflePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm_mask_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskShufflePs

func MaskShufflePs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm_mask_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSllEpi32

func MaskSllEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_sll_epi32'. Requires AVX512F.

func MaskSllEpi64

func MaskSllEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_sll_epi64'. Requires AVX512F.

func MaskSlliEpi32

func MaskSlliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_mask_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSlliEpi64

func MaskSlliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_mask_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSllvEpi32

func MaskSllvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm_mask_sllv_epi32'. Requires AVX512F.
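
A per-element shift with masking; as in hardware, counts above 31 produce zero (Go's << on uint32 already yields 0 for such counts, but the sketch makes it explicit; maskSllvEpi32 is hypothetical):

func maskSllvEpi32(src [4]uint32, k uint8, a, count [4]uint32) (dst [4]uint32) {
	for j := 0; j < 4; j++ {
		switch {
		case k&(1<<uint(j)) == 0:
			dst[j] = src[j] // mask bit clear: copy from 'src'
		case count[j] > 31:
			dst[j] = 0 // out-of-range shift counts zero the element
		default:
			dst[j] = a[j] << count[j]
		}
	}
	return
}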

func MaskSllvEpi64

func MaskSllvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm_mask_sllv_epi64'. Requires AVX512F.

func MaskSqrtPd

func MaskSqrtPd(src x86.M128d, k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm_mask_sqrt_pd'. Requires AVX512F.

func MaskSqrtPs

func MaskSqrtPs(src x86.M128, k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm_mask_sqrt_ps'. Requires AVX512F.

func MaskSqrtRoundSd

func MaskSqrtRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := SQRT(a[63:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_round_sd'. Requires AVX512F.

func MaskSqrtRoundSs

func MaskSqrtRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := SQRT(a[31:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_round_ss'. Requires AVX512F.

func MaskSqrtSd

func MaskSqrtSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := SQRT(a[63:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_mask_sqrt_sd'. Requires AVX512F.

func MaskSqrtSs

func MaskSqrtSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := SQRT(a[31:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_mask_sqrt_ss'. Requires AVX512F.

func MaskSraEpi32

func MaskSraEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_sra_epi32'. Requires AVX512F.

func MaskSraEpi64

func MaskSraEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_sra_epi64'. Requires AVX512F.

func MaskSraiEpi32

func MaskSraiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_mask_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSraiEpi64

func MaskSraiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_mask_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSravEpi32

func MaskSravEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm_mask_srav_epi32'. Requires AVX512F.
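
Go's >> on a signed type is already an arithmetic shift, so the sign-extending behavior falls out naturally; counts above 31 saturate to a full sign fill (maskSravEpi32 is hypothetical):

func maskSravEpi32(src [4]int32, k uint8, a [4]int32, count [4]uint32) (dst [4]int32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			c := count[j]
			if c > 31 {
				c = 31 // shifting a 32-bit value by 31 replicates the sign bit everywhere
			}
			dst[j] = a[j] >> c // arithmetic shift: shifts in sign bits
		} else {
			dst[j] = src[j]
		}
	}
	return
}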

func MaskSravEpi64

func MaskSravEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_mask_srav_epi64'. Requires AVX512F.

func MaskSrlEpi32

func MaskSrlEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srl_epi32'. Requires AVX512F.

func MaskSrlEpi64

func MaskSrlEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srl_epi64'. Requires AVX512F.

func MaskSrliEpi32

func MaskSrliEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_mask_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSrliEpi64

func MaskSrliEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_mask_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskSrlvEpi32

func MaskSrlvEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm_mask_srlv_epi32'. Requires AVX512F.

func MaskSrlvEpi64

func MaskSrlvEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm_mask_srlv_epi64'. Requires AVX512F.

func MaskStoreSd

func MaskStoreSd(mem_addr *float64, k x86.Mmask8, a x86.M128d)

MaskStoreSd: Store the lower double-precision (64-bit) floating-point element from 'a' into memory using writemask 'k'.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	MEM[mem_addr+63:mem_addr] := a[63:0]
FI

Instruction: 'VMOVSD'. Intrinsic: '_mm_mask_store_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskStoreSs

func MaskStoreSs(mem_addr *float32, k x86.Mmask8, a x86.M128)

MaskStoreSs: Store the lower single-precision (32-bit) floating-point element from 'a' into memory using writemask 'k'.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	MEM[mem_addr+31:mem_addr] := a[31:0]
FI

Instruction: 'VMOVSS'. Intrinsic: '_mm_mask_store_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
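
In Go terms the masked scalar store is just a conditional write through the pointer; the 16-byte alignment requirement has no direct Go-level analogue (maskStoreSs is hypothetical):

func maskStoreSs(memAddr *float32, k uint8, a [4]float32) {
	if k&1 != 0 {
		*memAddr = a[0] // store only when mask bit 0 is set
	}
}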

func MaskSubEpi32

func MaskSubEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm_mask_sub_epi32'. Requires AVX512F.

func MaskSubEpi64

func MaskSubEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm_mask_sub_epi64'. Requires AVX512F.

func MaskSubPd

func MaskSubPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm_mask_sub_pd'. Requires AVX512F.

func MaskSubPs

func MaskSubPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm_mask_sub_ps'. Requires AVX512F.

func MaskSubRoundSd

func MaskSubRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] - b[63:0]
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_round_sd'. Requires AVX512F.

func MaskSubRoundSs

func MaskSubRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] - b[31:0]
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_round_ss'. Requires AVX512F.

func MaskSubSd

func MaskSubSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] - b[63:0]
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_mask_sub_sd'. Requires AVX512F.

func MaskSubSs

func MaskSubSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] - b[31:0]
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_mask_sub_ss'. Requires AVX512F.

func MaskTernarylogicEpi32

func MaskTernarylogicEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bit from 'src', 'a', and 'b' are used to form a 3 bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using writemask 'k' at 32-bit granularity (32-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_mask_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
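
The per-bit lookup is easier to see in scalar form. A minimal pure-Go sketch of one 32-bit lane (the function name and the example imm8 are illustrative); with imm8 = 0xE8 the table encodes the three-way majority function:

	// ternaryLogic32 applies VPTERNLOGD's per-bit rule to one 32-bit
	// lane: the bits of src, a and b form a 3-bit index into imm8.
	func ternaryLogic32(src, a, b uint32, imm8 uint8) uint32 {
		var dst uint32
		for h := uint(0); h < 32; h++ {
			idx := (src>>h&1)<<2 | (a>>h&1)<<1 | b>>h&1
			dst |= uint32(imm8>>idx&1) << h
		}
		return dst
	}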

func MaskTernarylogicEpi64

func MaskTernarylogicEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific function is selected by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'src', 'a', and 'b' form a 3-bit index into 'imm8', and the bit of 'imm8' at that index is written to the corresponding bit in 'dst' using writemask 'k' at 64-bit granularity (64-bit elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_mask_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskTestEpi32Mask

func MaskTestEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm_mask_test_epi32_mask'. Requires AVX512F.
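
Mask-producing tests map naturally onto a uint8 in Go. An illustrative sketch of the pseudocode above (not a real binding):

	// maskTestEpi32Mask sets result bit j only when writemask bit j
	// of k1 is set and the lane-wise AND a[j]&b[j] is non-zero.
	func maskTestEpi32Mask(k1 uint8, a, b [4]uint32) (k uint8) {
		for j := uint(0); j < 4; j++ {
			if k1&(1<<j) != 0 && a[j]&b[j] != 0 {
				k |= 1 << j
			}
		}
		return
	}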

func MaskTestEpi64Mask

func MaskTestEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm_mask_test_epi64_mask'. Requires AVX512F.

func MaskTestnEpi32Mask

func MaskTestnEpi32Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 3
	i := j*32
	IF k1[j]
		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm_mask_testn_epi32_mask'. Requires AVX512F.

func MaskTestnEpi64Mask

func MaskTestnEpi64Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 1
	i := j*64
	IF k1[j]
		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_mask_testn_epi64_mask'. Requires AVX512F.

func MaskUnpackhiEpi32

func MaskUnpackhiEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_mask_unpackhi_epi32'. Requires AVX512F.
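
The INTERLEAVE_HIGH_DWORDS helper is just a fixed shuffle; the masked variant then selects per lane between this result and 'src'. A pure-Go sketch (illustrative only):

	// interleaveHighDwords mirrors INTERLEAVE_HIGH_DWORDS above: the
	// two high 32-bit lanes of each operand are interleaved a,b,a,b.
	func interleaveHighDwords(a, b [4]uint32) [4]uint32 {
		return [4]uint32{a[2], b[2], a[3], b[3]}
	}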

func MaskUnpackhiEpi64

func MaskUnpackhiEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_mask_unpackhi_epi64'. Requires AVX512F.

func MaskUnpackhiPd

func MaskUnpackhiPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_mask_unpackhi_pd'. Requires AVX512F.

func MaskUnpackhiPs

func MaskUnpackhiPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_mask_unpackhi_ps'. Requires AVX512F.

func MaskUnpackloEpi32

func MaskUnpackloEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_mask_unpacklo_epi32'. Requires AVX512F.

func MaskUnpackloEpi64

func MaskUnpackloEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_mask_unpacklo_epi64'. Requires AVX512F.

func MaskUnpackloPd

func MaskUnpackloPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_mask_unpacklo_pd'. Requires AVX512F.

func MaskUnpackloPs

func MaskUnpackloPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_mask_unpacklo_ps'. Requires AVX512F.

func MaskXorEpi32

func MaskXorEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm_mask_xor_epi32'. Requires AVX512F.
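
All of the two-operand masked integer ops in this family share the same lane-select skeleton; only the inner operation changes. A hypothetical pure-Go model:

	// maskXorEpi32 models _mm_mask_xor_epi32: lanes whose mask bit is
	// clear are copied from src rather than computed.
	func maskXorEpi32(src [4]uint32, k uint8, a, b [4]uint32) (dst [4]uint32) {
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] ^ b[j]
			} else {
				dst[j] = src[j]
			}
		}
		return
	}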

func MaskXorEpi64

func MaskXorEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm_mask_xor_epi64'. Requires AVX512F.

func MaskzAbsEpi32

func MaskzAbsEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ABS(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm_maskz_abs_epi32'. Requires AVX512F.

func MaskzAbsEpi64

func MaskzAbsEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi64: Compute the absolute value of packed 64-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ABS(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSQ'. Intrinsic: '_mm_maskz_abs_epi64'. Requires AVX512F.

func MaskzAddEpi32

func MaskzAddEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm_maskz_add_epi32'. Requires AVX512F.

func MaskzAddEpi64

func MaskzAddEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm_maskz_add_epi64'. Requires AVX512F.

func MaskzAddRoundSd

func MaskzAddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzAddRoundSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] + b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_round_sd'. Requires AVX512F.

func MaskzAddRoundSs

func MaskzAddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzAddRoundSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] + b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_round_ss'. Requires AVX512F.

func MaskzAddSd

func MaskzAddSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAddSd: Add the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] + b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VADDSD'. Intrinsic: '_mm_maskz_add_sd'. Requires AVX512F.

func MaskzAddSs

func MaskzAddSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] + b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VADDSS'. Intrinsic: '_mm_maskz_add_ss'. Requires AVX512F.

func MaskzAndEpi32

func MaskzAndEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndEpi32: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDD'. Intrinsic: '_mm_maskz_and_epi32'. Requires AVX512F.

func MaskzAndEpi64

func MaskzAndEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndEpi64: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDQ'. Intrinsic: '_mm_maskz_and_epi64'. Requires AVX512F.

func MaskzAndnotEpi32

func MaskzAndnotEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndnotEpi32: Compute the bitwise AND NOT of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDND'. Intrinsic: '_mm_maskz_andnot_epi32'. Requires AVX512F.

func MaskzAndnotEpi64

func MaskzAndnotEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAndnotEpi64: Compute the bitwise AND NOT of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPANDNQ'. Intrinsic: '_mm_maskz_andnot_epi64'. Requires AVX512F.

func MaskzBroadcastdEpi32

func MaskzBroadcastdEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_broadcastd_epi32'. Requires AVX512F.

func MaskzBroadcastqEpi64

func MaskzBroadcastqEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_broadcastq_epi64'. Requires AVX512F.

func MaskzBroadcastssPs

func MaskzBroadcastssPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzBroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_maskz_broadcastss_ps'. Requires AVX512F.

func MaskzCompressEpi32

func MaskzCompressEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCompressEpi32: Contiguously store the active 32-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSD'. Intrinsic: '_mm_maskz_compress_epi32'. Requires AVX512F.
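
Compression packs the selected lanes toward element 0 rather than selecting in place. A minimal Go sketch of the pseudocode above (names are illustrative):

	// maskzCompressEpi32 models VPCOMPRESSD with a zeromask: active
	// lanes are stored contiguously from element 0; the tail stays
	// zero because Go zero-initializes the result array.
	func maskzCompressEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
		m := 0
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[m] = a[j]
				m++
			}
		}
		return
	}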

func MaskzCompressEpi64

func MaskzCompressEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCompressEpi64: Contiguously store the active 64-bit integers in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VPCOMPRESSQ'. Intrinsic: '_mm_maskz_compress_epi64'. Requires AVX512F.

func MaskzCompressPd

func MaskzCompressPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzCompressPd: Contiguously store the active double-precision (64-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 64
m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[m+size-1:m] := a[i+63:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPD'. Intrinsic: '_mm_maskz_compress_pd'. Requires AVX512F.

func MaskzCompressPs

func MaskzCompressPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzCompressPs: Contiguously store the active single-precision (32-bit) floating-point elements in 'a' (those with their respective bit set in zeromask 'k') to 'dst', and set the remaining elements to zero.

size := 32
m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[m+size-1:m] := a[i+31:i]
		m := m + size
	FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0

Instruction: 'VCOMPRESSPS'. Intrinsic: '_mm_maskz_compress_ps'. Requires AVX512F.

func MaskzCvtRoundpsPh

func MaskzCvtRoundpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskzCvtRoundpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvt_roundps_ph'. Requires AVX512F.

func MaskzCvtRoundsdSs

func MaskzCvtRoundsdSs(k x86.Mmask8, a x86.M128, b x86.M128d, rounding int) (dst x86.M128)

MaskzCvtRoundsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := Convert_FP64_To_FP32(b[63:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvt_roundsd_ss'. Requires AVX512F.

func MaskzCvtRoundssSd

func MaskzCvtRoundssSd(k x86.Mmask8, a x86.M128d, b x86.M128, rounding int) (dst x86.M128d)

MaskzCvtRoundssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := Convert_FP32_To_FP64(b[31:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvt_roundss_sd'. Requires AVX512F.

func MaskzCvtepi16Epi32

func MaskzCvtepi16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm_maskz_cvtepi16_epi32'. Requires AVX512F.
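
In Go, the int16-to-int32 conversion is exactly the SignExtend used here, so the whole operation reduces to a masked conversion loop (an illustrative sketch, not part of this package):

	// maskzCvtepi16Epi32 models VPMOVSXWD with a zeromask; lanes with
	// a clear mask bit stay at Go's zero value.
	func maskzCvtepi16Epi32(k uint8, a [4]int16) (dst [4]int32) {
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = int32(a[j]) // sign extension
			}
		}
		return
	}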

func MaskzCvtepi16Epi64

func MaskzCvtepi16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi16Epi64: Sign extend packed 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm_maskz_cvtepi16_epi64'. Requires AVX512F.

func MaskzCvtepi32Epi16

func MaskzCvtepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVDW'. Intrinsic: '_mm_maskz_cvtepi32_epi16'. Requires AVX512F.

func MaskzCvtepi32Epi64

func MaskzCvtepi32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm_maskz_cvtepi32_epi64'. Requires AVX512F.

func MaskzCvtepi32Epi8

func MaskzCvtepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVDB'. Intrinsic: '_mm_maskz_cvtepi32_epi8'. Requires AVX512F.

func MaskzCvtepi32Pd

func MaskzCvtepi32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepi32Pd: Convert packed 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	m := j*64
	IF k[j]
		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
	ELSE
		dst[m+63:m] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PD'. Intrinsic: '_mm_maskz_cvtepi32_pd'. Requires AVX512F.

func MaskzCvtepi32Ps

func MaskzCvtepi32Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepi32Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTDQ2PS'. Intrinsic: '_mm_maskz_cvtepi32_ps'. Requires AVX512F.

func MaskzCvtepi64Epi16

func MaskzCvtepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVQW'. Intrinsic: '_mm_maskz_cvtepi64_epi16'. Requires AVX512F.

func MaskzCvtepi64Epi32

func MaskzCvtepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVQD'. Intrinsic: '_mm_maskz_cvtepi64_epi32'. Requires AVX512F.

func MaskzCvtepi64Epi8

func MaskzCvtepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVQB'. Intrinsic: '_mm_maskz_cvtepi64_epi8'. Requires AVX512F.

func MaskzCvtepi8Epi32

func MaskzCvtepi8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi8Epi32: Sign extend packed 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm_maskz_cvtepi8_epi32'. Requires AVX512F.

func MaskzCvtepi8Epi64

func MaskzCvtepi8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi8Epi64: Sign extend packed 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := SignExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm_maskz_cvtepi8_epi64'. Requires AVX512F.

func MaskzCvtepu16Epi32

func MaskzCvtepu16Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm_maskz_cvtepu16_epi32'. Requires AVX512F.

func MaskzCvtepu16Epi64

func MaskzCvtepu16Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu16Epi64: Zero extend packed unsigned 16-bit integers in the low 4 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+15:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm_maskz_cvtepu16_epi64'. Requires AVX512F.

func MaskzCvtepu32Epi64

func MaskzCvtepu32Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm_maskz_cvtepu32_epi64'. Requires AVX512F.

func MaskzCvtepu32Pd

func MaskzCvtepu32Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepu32Pd: Convert packed unsigned 32-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_UnsignedInt32_To_FP64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUDQ2PD'. Intrinsic: '_mm_maskz_cvtepu32_pd'. Requires AVX512F.

func MaskzCvtepu8Epi32

func MaskzCvtepu8Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu8Epi32: Zero extend packed unsigned 8-bit integers in the low 4 bytes of 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm_maskz_cvtepu8_epi32'. Requires AVX512F.

func MaskzCvtepu8Epi64

func MaskzCvtepu8Epi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 2 bytes of 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[l+7:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm_maskz_cvtepu8_epi64'. Requires AVX512F.

func MaskzCvtpdEpi32

func MaskzCvtpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2DQ'. Intrinsic: '_mm_maskz_cvtpd_epi32'. Requires AVX512F.

func MaskzCvtpdEpu32

func MaskzCvtpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2UDQ'. Intrinsic: '_mm_maskz_cvtpd_epu32'. Requires AVX512F.

func MaskzCvtpdPs

func MaskzCvtpdPs(k x86.Mmask8, a x86.M128d) (dst x86.M128)

MaskzCvtpdPs: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*32
	l := j*64
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTPD2PS'. Intrinsic: '_mm_maskz_cvtpd_ps'. Requires AVX512F.

func MaskzCvtphPs

func MaskzCvtphPs(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtphPs: Convert packed half-precision (16-bit) floating-point elements in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	m := j*16
	IF k[j]
		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPH2PS'. Intrinsic: '_mm_maskz_cvtph_ps'. Requires AVX512F.

func MaskzCvtpsEpi32

func MaskzCvtpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2DQ'. Intrinsic: '_mm_maskz_cvtps_epi32'. Requires AVX512F.

func MaskzCvtpsEpu32

func MaskzCvtpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UDQ'. Intrinsic: '_mm_maskz_cvtps_epu32'. Requires AVX512F.

func MaskzCvtpsPh

func MaskzCvtpsPh(k x86.Mmask8, a x86.M128, rounding int) (dst x86.M128i)

MaskzCvtpsPh: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed half-precision (16-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 3
			i := 16*j
			l := 32*j
			IF k[j]
				dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
			ELSE
				dst[i+15:i] := 0
			FI
		ENDFOR
		dst[MAX:64] := 0

Instruction: 'VCVTPS2PH'. Intrinsic: '_mm_maskz_cvtps_ph'. Requires AVX512F.

func MaskzCvtsdSs

func MaskzCvtsdSs(k x86.Mmask8, a x86.M128, b x86.M128d) (dst x86.M128)

MaskzCvtsdSs: Convert the lower double-precision (64-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VCVTSD2SS'. Intrinsic: '_mm_maskz_cvtsd_ss'. Requires AVX512F.

func MaskzCvtsepi32Epi16

func MaskzCvtsepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi32Epi16: Convert packed 32-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSDW'. Intrinsic: '_mm_maskz_cvtsepi32_epi16'. Requires AVX512F.

func MaskzCvtsepi32Epi8

func MaskzCvtsepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi32Epi8: Convert packed 32-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSDB'. Intrinsic: '_mm_maskz_cvtsepi32_epi8'. Requires AVX512F.
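
Unlike the plain truncating converts (VPMOVDB), the saturating forms clamp instead of dropping high bits. Saturate_Int32_To_Int8 can be sketched in Go as:

	// saturateInt32ToInt8 clamps to the int8 range, so e.g. 300
	// becomes 127 and -300 becomes -128, where a truncating int8(x)
	// conversion would keep only the low 8 bits.
	func saturateInt32ToInt8(x int32) int8 {
		if x > 127 {
			return 127
		}
		if x < -128 {
			return -128
		}
		return int8(x)
	}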

func MaskzCvtsepi64Epi16

func MaskzCvtsepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi16: Convert packed 64-bit integers in 'a' to packed 16-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVSQW'. Intrinsic: '_mm_maskz_cvtsepi64_epi16'. Requires AVX512F.

func MaskzCvtsepi64Epi32

func MaskzCvtsepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi32: Convert packed 64-bit integers in 'a' to packed 32-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSQD'. Intrinsic: '_mm_maskz_cvtsepi64_epi32'. Requires AVX512F.

func MaskzCvtsepi64Epi8

func MaskzCvtsepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi64Epi8: Convert packed 64-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVSQB'. Intrinsic: '_mm_maskz_cvtsepi64_epi8'. Requires AVX512F.

func MaskzCvtssSd

func MaskzCvtssSd(k x86.Mmask8, a x86.M128d, b x86.M128) (dst x86.M128d)

MaskzCvtssSd: Convert the lower single-precision (32-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point element, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VCVTSS2SD'. Intrinsic: '_mm_maskz_cvtss_sd'. Requires AVX512F.

func MaskzCvttpdEpi32

func MaskzCvttpdEpi32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpi32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2DQ'. Intrinsic: '_mm_maskz_cvttpd_epi32'. Requires AVX512F.
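
The truncating ("tt") converts map directly onto Go's float-to-int conversion, which truncates toward zero; the default MXCSR round-to-nearest-even used by the non-truncating VCVTPD2DQ form corresponds to math.RoundToEven. A sketch (the hardware's out-of-range behavior of returning the integer indefinite value is not modeled):

	import "math"

	// int32(x) truncates toward zero (Convert_FP64_To_Int32_Truncate);
	// rounding first with math.RoundToEven models round-to-nearest-even.
	func cvttFP64ToInt32(x float64) int32 { return int32(x) }
	func cvtFP64ToInt32(x float64) int32  { return int32(math.RoundToEven(x)) }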

func MaskzCvttpdEpu32

func MaskzCvttpdEpu32(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpu32: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 32*j
	l := 64*j
	IF k[j]
		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTTPD2UDQ'. Intrinsic: '_mm_maskz_cvttpd_epu32'. Requires AVX512F.

func MaskzCvttpsEpi32

func MaskzCvttpsEpi32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2DQ'. Intrinsic: '_mm_maskz_cvttps_epi32'. Requires AVX512F.

func MaskzCvttpsEpu32

func MaskzCvttpsEpu32(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpu32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 32-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UDQ'. Intrinsic: '_mm_maskz_cvttps_epu32'. Requires AVX512F.

func MaskzCvtusepi32Epi16

func MaskzCvtusepi32Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi32Epi16: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSDW'. Intrinsic: '_mm_maskz_cvtusepi32_epi16'. Requires AVX512F.

func MaskzCvtusepi32Epi8

func MaskzCvtusepi32Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi32Epi8: Convert packed unsigned 32-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSDB'. Intrinsic: '_mm_maskz_cvtusepi32_epi8'. Requires AVX512F.
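
The unsigned-saturating converts clamp only at the top of the destination range. An illustrative Go equivalent of the saturation step above:

	// saturateUint32ToUint8 clamps like VPMOVUSDB: any value above
	// 255 sticks at 255; values in range pass through unchanged.
	func saturateUint32ToUint8(x uint32) uint8 {
		if x > 255 {
			return 255
		}
		return uint8(x)
	}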

func MaskzCvtusepi64Epi16

func MaskzCvtusepi64Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi16: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 16-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 16*j
	IF k[j]
		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:32] := 0

Instruction: 'VPMOVUSQW'. Intrinsic: '_mm_maskz_cvtusepi64_epi16'. Requires AVX512F.

func MaskzCvtusepi64Epi32

func MaskzCvtusepi64Epi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi32: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 32-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 32*j
	IF k[j]
		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSQD'. Intrinsic: '_mm_maskz_cvtusepi64_epi32'. Requires AVX512F.

func MaskzCvtusepi64Epi8

func MaskzCvtusepi64Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi64Epi8: Convert packed unsigned 64-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:16] := 0

Instruction: 'VPMOVUSQB'. Intrinsic: '_mm_maskz_cvtusepi64_epi8'. Requires AVX512F.

func MaskzDivPd

func MaskzDivPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzDivPd: Divide packed double-precision (64-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := 64*j
	IF k[j]
		dst[i+63:i] := a[i+63:i] / b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPD'. Intrinsic: '_mm_maskz_div_pd'. Requires AVX512F.

func MaskzDivPs

func MaskzDivPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzDivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := 32*j
	IF k[j]
		dst[i+31:i] := a[i+31:i] / b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDIVPS'. Intrinsic: '_mm_maskz_div_ps'. Requires AVX512F.

func MaskzDivRoundSd

func MaskzDivRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzDivRoundSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] / b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_round_sd'. Requires AVX512F.

func MaskzDivRoundSs

func MaskzDivRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzDivRoundSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] / b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_round_ss'. Requires AVX512F.

func MaskzDivSd

func MaskzDivSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzDivSd: Divide the lower double-precision (64-bit) floating-point element in 'a' by the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] / b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VDIVSD'. Intrinsic: '_mm_maskz_div_sd'. Requires AVX512F.

func MaskzDivSs

func MaskzDivSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzDivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] / b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VDIVSS'. Intrinsic: '_mm_maskz_div_ss'. Requires AVX512F.

func MaskzExpandEpi32

func MaskzExpandEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzExpandEpi32: Load contiguous active 32-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDD'. Intrinsic: '_mm_maskz_expand_epi32'. Requires AVX512F.
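
Expand is the inverse of compress: consecutive source elements are scattered to the lanes whose mask bit is set. A minimal Go sketch (illustrative only):

	// maskzExpandEpi32 models VPEXPANDD with a zeromask: a[0], a[1],
	// ... land in the set-bit lanes of dst in order; clear-bit lanes
	// remain zero.
	func maskzExpandEpi32(k uint8, a [4]uint32) (dst [4]uint32) {
		m := 0
		for j := uint(0); j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[m]
				m++
			}
		}
		return
	}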

func MaskzExpandEpi64

func MaskzExpandEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzExpandEpi64: Load contiguous active 64-bit integers from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPEXPANDQ'. Intrinsic: '_mm_maskz_expand_epi64'. Requires AVX512F.

func MaskzExpandPd

func MaskzExpandPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzExpandPd: Load contiguous active double-precision (64-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[m+63:m]
		m := m + 64
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPD'. Intrinsic: '_mm_maskz_expand_pd'. Requires AVX512F.

func MaskzExpandPs

func MaskzExpandPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzExpandPs: Load contiguous active single-precision (32-bit) floating-point elements from 'a' (those with their respective bit set in mask 'k'), and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

m := 0
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[m+31:m]
		m := m + 32
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXPANDPS'. Intrinsic: '_mm_maskz_expand_ps'. Requires AVX512F.

func MaskzFixupimmPd

func MaskzFixupimmPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskzFixupimmPd: Fix up packed double-precision (64-bit) floating-point elements in 'a' and 'b' using packed 64-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPD'. Intrinsic: '_mm_maskz_fixupimm_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
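
The heart of the FIXUPIMMPD pseudocode is the 'token_response' lookup: 'c' packs eight 4-bit response codes, and the token class 'j' of the classified input selects one nibble. A short Go sketch of that bit extraction (the helper name is invented):

	// tokenResponse extracts src3[3+4*j:4*j], the 4-bit response code
	// for token class j, from the packed 64-bit control word.
	func tokenResponse(src3 uint64, j uint) uint8 {
		return uint8(src3>>(4*j)) & 0xF
	}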

func MaskzFixupimmPs

func MaskzFixupimmPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskzFixupimmPs: Fix up packed single-precision (32-bit) floating-point elements in 'a' and 'b' using packed 32-bit integers in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMPS'. Intrinsic: '_mm_maskz_fixupimm_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmRoundSd

func MaskzFixupimmRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte, rounding int) (dst x86.M128d)

MaskzFixupimmRoundSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
			tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
			CASE(tsrc[63:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[63:0] := src1[63:0]
			1 : dest[63:0] := tsrc[63:0]
			2 : dest[63:0] := QNaN(tsrc[63:0])
			3 : dest[63:0] := QNAN_Indefinite
			4 : dest[63:0] := -INF
			5 : dest[63:0] := +INF
			6 : dest[63:0] := tsrc.sign? -INF : +INF
			7 : dest[63:0] := -0
			8 : dest[63:0] := +0
			9 : dest[63:0] := -1
			10: dest[63:0] := +1
			11: dest[63:0] := 1/2
			12: dest[63:0] := 90.0
			13: dest[63:0] := PI/2
			14: dest[63:0] := MAX_FLOAT
			15: dest[63:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[63:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[63:0]
		}

		IF k[0]
			dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
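
The 'rounding' argument is a plain int, so a caller would OR the control bits together. The Go constant names below are invented for this sketch; the numeric values mirror Intel's immintrin.h definitions:

	const (
		froundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		froundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		froundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		froundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		froundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION
		froundNoExc        = 0x08 // _MM_FROUND_NO_EXC
	)

	// Truncate and suppress exceptions:
	// dst := MaskzFixupimmRoundSd(k, a, b, c, imm8, froundToZero|froundNoExc)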

func MaskzFixupimmRoundSs

func MaskzFixupimmRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte, rounding int) (dst x86.M128)

MaskzFixupimmRoundSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		enum TOKEN_TYPE {
			QNAN_TOKEN := 0,
			SNAN_TOKEN := 1,
			ZERO_VALUE_TOKEN := 2,
			ONE_VALUE_TOKEN := 3,
			NEG_INF_TOKEN := 4,
			POS_INF_TOKEN := 5,
			NEG_VALUE_TOKEN := 6,
			POS_VALUE_TOKEN := 7
		}
		FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
			tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
			CASE(tsrc[31:0] of TOKEN_TYPE)
			QNAN_TOKEN:j := 0
			SNAN_TOKEN:j := 1
			ZERO_VALUE_TOKEN: j := 2
			ONE_VALUE_TOKEN: j := 3
			NEG_INF_TOKEN: j := 4
			POS_INF_TOKEN: j := 5
			NEG_VALUE_TOKEN: j := 6
			POS_VALUE_TOKEN: j := 7
			ESAC

			token_response[3:0] := src3[3+4*j:4*j]

			CASE(token_response[3:0]) of
			0 : dest[31:0] := src1[31:0]
			1 : dest[31:0] := tsrc[31:0]
			2 : dest[31:0] := QNaN(tsrc[31:0])
			3 : dest[31:0] := QNAN_Indefinite
			4 : dest[31:0] := -INF
			5 : dest[31:0] := +INF
			6 : dest[31:0] := tsrc.sign? -INF : +INF
			7 : dest[31:0] := -0
			8 : dest[31:0] := +0
			9 : dest[31:0] := -1
			10: dest[31:0] := +1
			11: dest[31:0] := 1/2
			12: dest[31:0] := 90.0
			13: dest[31:0] := PI/2
			14: dest[31:0] := MAX_FLOAT
			15: dest[31:0] := -MAX_FLOAT
			ESAC

			CASE(tsrc[31:0] of TOKEN_TYPE)
			ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
			ZERO_VALUE_TOKEN: if imm8[1] then set #IE
			ONE_VALUE_TOKEN: if imm8[2] then set #ZE
			ONE_VALUE_TOKEN: if imm8[3] then set #IE
			SNAN_TOKEN: if imm8[4] then set #IE
			NEG_INF_TOKEN: if imm8[5] then set #IE
			NEG_VALUE_TOKEN: if imm8[6] then set #IE
			POS_INF_TOKEN: if imm8[7] then set #IE
			ESAC
			RETURN dest[31:0]
		}

		IF k[0]
			dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmSd

func MaskzFixupimmSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128i, imm8 byte) (dst x86.M128d)

MaskzFixupimmSd: Fix up the lower double-precision (64-bit) floating-point elements in 'a' and 'b' using the lower 64-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
	CASE(tsrc[63:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[63:0] := src1[63:0]
	1 : dest[63:0] := tsrc[63:0]
	2 : dest[63:0] := QNaN(tsrc[63:0])
	3 : dest[63:0] := QNAN_Indefinite
	4 : dest[63:0] := -INF
	5 : dest[63:0] := +INF
	6 : dest[63:0] := tsrc.sign? -INF : +INF
	7 : dest[63:0] := -0
	8 : dest[63:0] := +0
	9 : dest[63:0] := -1
	10: dest[63:0] := +1
	11: dest[63:0] := 1/2
	12: dest[63:0] := 90.0
	13: dest[63:0] := PI/2
	14: dest[63:0] := MAX_FLOAT
	15: dest[63:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[63:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[63:0]
}

IF k[0]
	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSD'. Intrinsic: '_mm_maskz_fixupimm_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFixupimmSs

func MaskzFixupimmSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128i, imm8 byte) (dst x86.M128)

MaskzFixupimmSs: Fix up the lower single-precision (32-bit) floating-point elements in 'a' and 'b' using the lower 32-bit integer in 'c', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. 'imm8' is used to set the required flags reporting.

enum TOKEN_TYPE {
	QNAN_TOKEN := 0,
	SNAN_TOKEN := 1,
	ZERO_VALUE_TOKEN := 2,
	ONE_VALUE_TOKEN := 3,
	NEG_INF_TOKEN := 4,
	POS_INF_TOKEN := 5,
	NEG_VALUE_TOKEN := 6,
	POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
	CASE(tsrc[31:0] of TOKEN_TYPE)
	QNAN_TOKEN:j := 0
	SNAN_TOKEN:j := 1
	ZERO_VALUE_TOKEN: j := 2
	ONE_VALUE_TOKEN: j := 3
	NEG_INF_TOKEN: j := 4
	POS_INF_TOKEN: j := 5
	NEG_VALUE_TOKEN: j := 6
	POS_VALUE_TOKEN: j := 7
	ESAC

	token_response[3:0] := src3[3+4*j:4*j]

	CASE(token_response[3:0]) of
	0 : dest[31:0] := src1[31:0]
	1 : dest[31:0] := tsrc[31:0]
	2 : dest[31:0] := QNaN(tsrc[31:0])
	3 : dest[31:0] := QNAN_Indefinite
	4 : dest[31:0] := -INF
	5 : dest[31:0] := +INF
	6 : dest[31:0] := tsrc.sign? -INF : +INF
	7 : dest[31:0] := -0
	8 : dest[31:0] := +0
	9 : dest[31:0] := -1
	10: dest[31:0] := +1
	11: dest[31:0] := 1/2
	12: dest[31:0] := 90.0
	13: dest[31:0] := PI/2
	14: dest[31:0] := MAX_FLOAT
	15: dest[31:0] := -MAX_FLOAT
	ESAC

	CASE(tsrc[31:0] of TOKEN_TYPE)
	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
	ONE_VALUE_TOKEN: if imm8[3] then set #IE
	SNAN_TOKEN: if imm8[4] then set #IE
	NEG_INF_TOKEN: if imm8[5] then set #IE
	NEG_VALUE_TOKEN: if imm8[6] then set #IE
	POS_INF_TOKEN: if imm8[7] then set #IE
	ESAC
	RETURN dest[31:0]
}

IF k[0]
	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFIXUPIMMSS'. Intrinsic: '_mm_maskz_fixupimm_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzFmaddPd

func MaskzFmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PD, VFMADD213PD, VFMADD231PD'. Intrinsic: '_mm_maskz_fmadd_pd'. Requires AVX512F.
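
math.FMA computes a*b+c with a single rounding, matching the fused behavior of VFMADD, so the zeromasked packed form can be sketched directly in Go (invented helper; assumes import "math"):

	func maskzFmaddPd(k uint8, a, b, c [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) != 0 {
				dst[j] = math.FMA(a[j], b[j], c[j]) // fused: one rounding
			} // masked-off lanes stay zero
		}
		return dst
	}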

func MaskzFmaddPs

func MaskzFmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADD132PS, VFMADD213PS, VFMADD231PS'. Intrinsic: '_mm_maskz_fmadd_ps'. Requires AVX512F.

func MaskzFmaddRoundSd

func MaskzFmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_round_sd'. Requires AVX512F.

func MaskzFmaddRoundSs

func MaskzFmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_round_ss'. Requires AVX512F.

func MaskzFmaddSd

func MaskzFmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMADD132SD, VFMADD213SD, VFMADD231SD'. Intrinsic: '_mm_maskz_fmadd_sd'. Requires AVX512F.

func MaskzFmaddSs

func MaskzFmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMADD132SS, VFMADD213SS, VFMADD231SS'. Intrinsic: '_mm_maskz_fmadd_ss'. Requires AVX512F.

func MaskzFmaddsubPd

func MaskzFmaddsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmaddsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PD, VFMADDSUB213PD, VFMADDSUB231PD'. Intrinsic: '_mm_maskz_fmaddsub_pd'. Requires AVX512F.
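
The "(j is even)" branch means even lanes subtract 'c' while odd lanes add it. A Go sketch of the same alternation (invented helper; assumes import "math"):

	func maskzFmaddsubPd(k uint8, a, b, c [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) == 0 {
				continue // masked-off lane stays zero
			}
			if j%2 == 0 {
				dst[j] = math.FMA(a[j], b[j], -c[j]) // even lane: subtract
			} else {
				dst[j] = math.FMA(a[j], b[j], c[j]) // odd lane: add
			}
		}
		return dst
	}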

func MaskzFmaddsubPs

func MaskzFmaddsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmaddsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively add and subtract packed elements in 'c' to/from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMADDSUB132PS, VFMADDSUB213PS, VFMADDSUB231PS'. Intrinsic: '_mm_maskz_fmaddsub_ps'. Requires AVX512F.

func MaskzFmsubPd

func MaskzFmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PD, VFMSUB213PD, VFMSUB231PD'. Intrinsic: '_mm_maskz_fmsub_pd'. Requires AVX512F.

func MaskzFmsubPs

func MaskzFmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUB132PS, VFMSUB213PS, VFMSUB231PS'. Intrinsic: '_mm_maskz_fmsub_ps'. Requires AVX512F.

func MaskzFmsubRoundSd

func MaskzFmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_round_sd'. Requires AVX512F.

func MaskzFmsubRoundSs

func MaskzFmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_round_ss'. Requires AVX512F.

func MaskzFmsubSd

func MaskzFmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SD, VFMSUB213SD, VFMSUB231SD'. Intrinsic: '_mm_maskz_fmsub_sd'. Requires AVX512F.

func MaskzFmsubSs

func MaskzFmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFMSUB132SS, VFMSUB213SS, VFMSUB231SS'. Intrinsic: '_mm_maskz_fmsub_ss'. Requires AVX512F.

func MaskzFmsubaddPd

func MaskzFmsubaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFmsubaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF (j is even)
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
		ELSE
			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PD, VFMSUBADD213PD, VFMSUBADD231PD'. Intrinsic: '_mm_maskz_fmsubadd_pd'. Requires AVX512F.

func MaskzFmsubaddPs

func MaskzFmsubaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFmsubaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', alternatively subtract and add packed elements in 'c' from/to the intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF (j is even)
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
		ELSE
			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFMSUBADD132PS, VFMSUBADD213PS, VFMSUBADD231PS'. Intrinsic: '_mm_maskz_fmsubadd_ps'. Requires AVX512F.

func MaskzFnmaddPd

func MaskzFnmaddPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmaddPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PD, VFNMADD213PD, VFNMADD231PD'. Intrinsic: '_mm_maskz_fnmadd_pd'. Requires AVX512F.
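
Note that -(a*b)+c is simply the fused multiply-add of (-a, b, c), so the negation costs no extra rounding step. In Go terms (assumes import "math"):

	// fnmadd computes -(a*b)+c with a single rounding, like VFNMADD.
	func fnmadd(a, b, c float64) float64 {
		return math.FMA(-a, b, c)
	}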

func MaskzFnmaddPs

func MaskzFnmaddPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmaddPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', add the negated intermediate result to packed elements in 'c', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMADD132PS, VFNMADD213PS, VFNMADD231PS'. Intrinsic: '_mm_maskz_fnmadd_ps'. Requires AVX512F.

func MaskzFnmaddRoundSd

func MaskzFnmaddRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFnmaddRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SD, VFNMADD213SD, VFNMADD231SD'. Intrinsic: '_mm_maskz_fnmadd_round_sd'. Requires AVX512F.

func MaskzFnmaddRoundSs

func MaskzFnmaddRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFnmaddRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_round_ss'. Requires AVX512F.

func MaskzFnmaddSd

func MaskzFnmaddSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmaddSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMADD213SD, VFNMADD231SD, VFNMADD132SD'. Intrinsic: '_mm_maskz_fnmadd_sd'. Requires AVX512F.

func MaskzFnmaddSs

func MaskzFnmaddSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmaddSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and add the negated intermediate result to the lower element in 'c'. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMADD132SS, VFNMADD213SS, VFNMADD231SS'. Intrinsic: '_mm_maskz_fnmadd_ss'. Requires AVX512F.

func MaskzFnmsubPd

func MaskzFnmsubPd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmsubPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PD, VFNMSUB213PD, VFNMSUB231PD'. Intrinsic: '_mm_maskz_fnmsub_pd'. Requires AVX512F.

func MaskzFnmsubPs

func MaskzFnmsubPs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmsubPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', subtract packed elements in 'c' from the negated intermediate result, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VFNMSUB132PS, VFNMSUB213PS, VFNMSUB231PS'. Intrinsic: '_mm_maskz_fnmsub_ps'. Requires AVX512F.

func MaskzFnmsubRoundSd

func MaskzFnmsubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d, rounding int) (dst x86.M128d)

MaskzFnmsubRoundSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_round_sd'. Requires AVX512F.

func MaskzFnmsubRoundSs

func MaskzFnmsubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128, rounding int) (dst x86.M128)

MaskzFnmsubRoundSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_round_ss'. Requires AVX512F.

func MaskzFnmsubSd

func MaskzFnmsubSd(k x86.Mmask8, a x86.M128d, b x86.M128d, c x86.M128d) (dst x86.M128d)

MaskzFnmsubSd: Multiply the lower double-precision (64-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SD, VFNMSUB213SD, VFNMSUB231SD'. Intrinsic: '_mm_maskz_fnmsub_sd'. Requires AVX512F.

func MaskzFnmsubSs

func MaskzFnmsubSs(k x86.Mmask8, a x86.M128, b x86.M128, c x86.M128) (dst x86.M128)

MaskzFnmsubSs: Multiply the lower single-precision (32-bit) floating-point elements in 'a' and 'b', and subtract the lower element in 'c' from the negated intermediate result. Store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VFNMSUB132SS, VFNMSUB213SS, VFNMSUB231SS'. Intrinsic: '_mm_maskz_fnmsub_ss'. Requires AVX512F.

func MaskzGetexpPd

func MaskzGetexpPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzGetexpPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPD'. Intrinsic: '_mm_maskz_getexp_pd'. Requires AVX512F.
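
For finite nonzero inputs, ConvertExpFP64 is what math.Logb already returns: the unbiased binary exponent, i.e. floor(log2(|x|)). A per-element sketch of the zeromasked form (invented helper; assumes import "math"; zero/Inf/NaN behavior follows Logb and is not verified against the instruction here):

	func maskzGetexpPd(k uint8, a [2]float64) [2]float64 {
		var dst [2]float64
		for j := 0; j < 2; j++ {
			if k&(1<<j) != 0 {
				dst[j] = math.Logb(a[j]) // floor(log2(|x|)) as a float64
			} // masked-off lanes stay zero
		}
		return dst
	}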

func MaskzGetexpPs

func MaskzGetexpPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzGetexpPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VGETEXPPS'. Intrinsic: '_mm_maskz_getexp_ps'. Requires AVX512F.

func MaskzGetexpRoundSd

func MaskzGetexpRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzGetexpRoundSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := ConvertExpFP64(b[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_round_sd'. Requires AVX512F.

func MaskzGetexpRoundSs

func MaskzGetexpRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzGetexpRoundSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := ConvertExpFP32(b[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_round_ss'. Requires AVX512F.

func MaskzGetexpSd

func MaskzGetexpSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzGetexpSd: Convert the exponent of the lower double-precision (64-bit) floating-point element in 'b' to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VGETEXPSD'. Intrinsic: '_mm_maskz_getexp_sd'. Requires AVX512F.

func MaskzGetexpSs

func MaskzGetexpSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzGetexpSs: Convert the exponent of the lower single-precision (32-bit) floating-point element in 'b' to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for the lower element.

IF k[0]
	dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VGETEXPSS'. Intrinsic: '_mm_maskz_getexp_ss'. Requires AVX512F.

func MaskzGetmantPd

func MaskzGetmantPd(k x86.Mmask8, a x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskzGetmantPd: Normalize the mantissas of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 1
			i := j*64
			IF k[j]
				dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPD'. Intrinsic: '_mm_maskz_getmant_pd'. Requires AVX512F.
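
For the _MM_MANT_NORM_1_2 interval with _MM_MANT_SIGN_src, GetNormalizedMantissa can be approximated with math.Frexp, whose fraction lies in [0.5, 1). A sketch for finite nonzero inputs (zero/Inf/NaN handling omitted; assumes import "math"):

	func getmant12(x float64) float64 {
		frac, _ := math.Frexp(x) // |frac| in [0.5, 1), sign preserved
		return frac * 2          // |result| in [1, 2); scaling is exact
	}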

func MaskzGetmantPs

func MaskzGetmantPs(k x86.Mmask8, a x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskzGetmantPs: Normalize the mantissas of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		FOR j := 0 to 3
			i := j*32
			IF k[j]
				dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:128] := 0

Instruction: 'VGETMANTPS'. Intrinsic: '_mm_maskz_getmant_ps'. Requires AVX512F.

func MaskzGetmantRoundSd

func MaskzGetmantRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128d)

MaskzGetmantRoundSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_round_sd'. Requires AVX512F.

func MaskzGetmantRoundSs

func MaskzGetmantRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM, rounding int) (dst x86.M128)

MaskzGetmantRoundSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_round_ss'. Requires AVX512F.

func MaskzGetmantSd

func MaskzGetmantSd(k x86.Mmask8, a x86.M128d, b x86.M128d, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128d)

MaskzGetmantSd: Normalize the mantissas of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSD'. Intrinsic: '_mm_maskz_getmant_sd'. Requires AVX512F.

func MaskzGetmantSs

func MaskzGetmantSs(k x86.Mmask8, a x86.M128, b x86.M128, interv MMMANTISSANORMENUM, sc MMMANTISSASIGNENUM) (dst x86.M128)

MaskzGetmantSs: Normalize the mantissas of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'. This intrinsic essentially calculates '±(2^k)*|x.significand|', where 'k' depends on the interval range defined by 'interv' and the sign depends on 'sc' and the source sign.

The mantissa is normalized to the interval specified by 'interv', which can take the following values:

    _MM_MANT_NORM_1_2     // interval [1, 2)
    _MM_MANT_NORM_p5_2    // interval [0.5, 2)
    _MM_MANT_NORM_p5_1    // interval [0.5, 1)
    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)

The sign is determined by 'sc' which can take the following values:

    _MM_MANT_SIGN_src     // sign = sign(src)
    _MM_MANT_SIGN_zero    // sign = 0
    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1

		IF k[0]
			dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VGETMANTSS'. Intrinsic: '_mm_maskz_getmant_ss'. Requires AVX512F.

func MaskzLoadSd

func MaskzLoadSd(k x86.Mmask8, mem_addr *float64) (dst x86.M128d)

MaskzLoadSd: Load a double-precision (64-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper element of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
	dst[63:0] := 0
FI
dst[MAX:64] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_load_sd'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).
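
Pointer caveat aside, the zeromasked scalar load is easy to model: the lower lane comes from memory only when mask bit 0 is set, and everything above it is zeroed. A sketch (hypothetical helper; the real instruction additionally has the alignment requirement noted above):

	func maskzLoadSd(k uint8, mem *float64) [2]float64 {
		var dst [2]float64 // dst[1] stays zero, as in the pseudocode
		if k&1 != 0 {
			dst[0] = *mem
		}
		return dst
	}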

func MaskzLoadSs

func MaskzLoadSs(k x86.Mmask8, mem_addr *float32) (dst x86.M128)

MaskzLoadSs: Load a single-precision (32-bit) floating-point element from memory into the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and set the upper elements of 'dst' to zero. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

IF k[0]
	dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
	dst[31:0] := 0
FI
dst[MAX:32] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_load_ss'. Requires AVX512F.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskzMaxEpi32

func MaskzMaxEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm_maskz_max_epi32'. Requires AVX512F.
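
A Go sketch of the zeromasked signed maximum; the unsigned variants below differ only in comparing uint32/uint64 lanes (invented helper):

	func maskzMaxEpi32(k uint8, a, b [4]int32) [4]int32 {
		var dst [4]int32
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue // masked-off lane stays zero
			}
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		}
		return dst
	}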

func MaskzMaxEpi64

func MaskzMaxEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_maskz_max_epi64'. Requires AVX512F.

func MaskzMaxEpu32

func MaskzMaxEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] > b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm_maskz_max_epu32'. Requires AVX512F.

func MaskzMaxEpu64

func MaskzMaxEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] > b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_maskz_max_epu64'. Requires AVX512F.

func MaskzMaxPd

func MaskzMaxPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMaxPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPD'. Intrinsic: '_mm_maskz_max_pd'. Requires AVX512F.
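
The MAX() in this pseudocode follows the instruction's ordered-compare semantics rather than Go's math.Max: when either operand is NaN, or both operands are zeros of either sign, the second source operand is returned. A scalar sketch (assumes import "math"):

	// vmax mimics the per-element MAX() of VMAXPD: effectively
	// 'a > b ? a : b', so b wins for NaN inputs and for ±0 vs ±0.
	func vmax(a, b float64) float64 {
		if math.IsNaN(a) || math.IsNaN(b) {
			return b // second source operand, unlike math.Max
		}
		if a > b {
			return a
		}
		return b
	}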

func MaskzMaxPs

func MaskzMaxPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMAXPS'. Intrinsic: '_mm_maskz_max_ps'. Requires AVX512F.

func MaskzMaxRoundSd

func MaskzMaxRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskzMaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MAX(a[63:0], b[63:0])
	ELSE
		dst[63:0] := 0
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_round_sd'. Requires AVX512F.

func MaskzMaxRoundSs

func MaskzMaxRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskzMaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MAX(a[31:0], b[31:0])
	ELSE
		dst[31:0] := 0
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_round_ss'. Requires AVX512F.

func MaskzMaxSd

func MaskzMaxSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMaxSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_maskz_max_sd'. Requires AVX512F.

func MaskzMaxSs

func MaskzMaxSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_maskz_max_ss'. Requires AVX512F.

func MaskzMinEpi32

func MaskzMinEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm_maskz_min_epi32'. Requires AVX512F.

func MaskzMinEpi64

func MaskzMinEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_maskz_min_epi64'. Requires AVX512F.

func MaskzMinEpu32

func MaskzMinEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF a[i+31:i] < b[i+31:i]
			dst[i+31:i] := a[i+31:i]
		ELSE
			dst[i+31:i] := b[i+31:i]
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm_maskz_min_epu32'. Requires AVX512F.

func MaskzMinEpu64

func MaskzMinEpu64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF a[i+63:i] < b[i+63:i]
			dst[i+63:i] := a[i+63:i]
		ELSE
			dst[i+63:i] := b[i+63:i]
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_maskz_min_epu64'. Requires AVX512F.

func MaskzMinPd

func MaskzMinPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMinPd: Compare packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPD'. Intrinsic: '_mm_maskz_min_pd'. Requires AVX512F.

func MaskzMinPs

func MaskzMinPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMINPS'. Intrinsic: '_mm_maskz_min_ps'. Requires AVX512F.

func MaskzMinRoundSd

func MaskzMinRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaskzMinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[63:0] := MIN(a[63:0], b[63:0])
	ELSE
		dst[63:0] := 0
	FI
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_round_sd'. Requires AVX512F.

func MaskzMinRoundSs

func MaskzMinRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaskzMinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	IF k[0]
		dst[31:0] := MIN(a[31:0], b[31:0])
	ELSE
		dst[31:0] := 0
	FI
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_round_ss'. Requires AVX512F.

func MaskzMinSd

func MaskzMinSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMinSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_maskz_min_sd'. Requires AVX512F.

func MaskzMinSs

func MaskzMinSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_maskz_min_ss'. Requires AVX512F.

func MaskzMovEpi32

func MaskzMovEpi32(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzMovEpi32: Move packed 32-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA32'. Intrinsic: '_mm_maskz_mov_epi32'. Requires AVX512F.
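
A scalar Go sketch of this masked move (illustrative only; the helper name is hypothetical) makes the zeromask behavior visible:

	// maskzMovEpi32 models _mm_maskz_mov_epi32: copy lane j from 'a' when
	// mask bit j is set, otherwise leave the lane zero.
	func maskzMovEpi32(k uint8, a [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[j]
			}
		}
		return
	}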

func MaskzMovEpi64

func MaskzMovEpi64(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzMovEpi64: Move packed 64-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQA64'. Intrinsic: '_mm_maskz_mov_epi64'. Requires AVX512F.

func MaskzMovPd

func MaskzMovPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzMovPd: Move packed double-precision (64-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPD'. Intrinsic: '_mm_maskz_mov_pd'. Requires AVX512F.

func MaskzMovPs

func MaskzMovPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMovPs: Move packed single-precision (32-bit) floating-point elements from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVAPS'. Intrinsic: '_mm_maskz_mov_ps'. Requires AVX512F.

func MaskzMoveSd

func MaskzMoveSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMoveSd: Move the lower double-precision (64-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMOVSD'. Intrinsic: '_mm_maskz_move_sd'. Requires AVX512F.

func MaskzMoveSs

func MaskzMoveSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMOVSS'. Intrinsic: '_mm_maskz_move_ss'. Requires AVX512F.

func MaskzMovedupPd

func MaskzMovedupPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzMovedupPd: Duplicate even-indexed double-precision (64-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDDUP'. Intrinsic: '_mm_maskz_movedup_pd'. Requires AVX512F.
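
The duplicate-then-mask order above can be mirrored in a scalar Go sketch (the helper name and the [2]float64 representation are assumed for illustration):

	// maskzMovedupPd models _mm_maskz_movedup_pd: broadcast the even-indexed
	// element a[0] to both lanes first, then apply the zeromask.
	func maskzMovedupPd(k uint8, a [2]float64) (dst [2]float64) {
		tmp := [2]float64{a[0], a[0]}
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = tmp[j]
			}
		}
		return
	}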

func MaskzMovehdupPs

func MaskzMovehdupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMovehdupPs: Duplicate odd-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSHDUP'. Intrinsic: '_mm_maskz_movehdup_ps'. Requires AVX512F.

func MaskzMoveldupPs

func MaskzMoveldupPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzMoveldupPs: Duplicate even-indexed single-precision (32-bit) floating-point elements from 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVSLDUP'. Intrinsic: '_mm_maskz_moveldup_ps'. Requires AVX512F.

func MaskzMulEpi32

func MaskzMulEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm_maskz_mul_epi32'. Requires AVX512F.
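
The subtle point here is that only the low 32 bits of each 64-bit lane participate, sign-extended to a full 64-bit product. A scalar Go sketch (hypothetical helper; [2]int64 lanes assumed):

	// maskzMulEpi32 models _mm_maskz_mul_epi32: multiply the sign-extended
	// low 32 bits of each 64-bit lane into a 64-bit result.
	func maskzMulEpi32(k uint8, a, b [2]int64) (dst [2]int64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = int64(int32(a[j])) * int64(int32(b[j]))
			}
		}
		return
	}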

func MaskzMulEpu32

func MaskzMulEpu32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm_maskz_mul_epu32'. Requires AVX512F.

func MaskzMulPd

func MaskzMulPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMulPd: Multiply packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] * b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPD'. Intrinsic: '_mm_maskz_mul_pd'. Requires AVX512F.

func MaskzMulPs

func MaskzMulPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] * b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMULPS'. Intrinsic: '_mm_maskz_mul_ps'. Requires AVX512F.

func MaskzMulRoundSd

func MaskzMulRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzMulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] * b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_round_sd'. Requires AVX512F.

func MaskzMulRoundSs

func MaskzMulRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzMulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] * b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_round_ss'. Requires AVX512F.

func MaskzMulSd

func MaskzMulSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzMulSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] * b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_maskz_mul_sd'. Requires AVX512F.

func MaskzMulSs

func MaskzMulSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzMulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] * b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_maskz_mul_ss'. Requires AVX512F.

func MaskzMulloEpi32

func MaskzMulloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		tmp[63:0] := a[i+31:i] * b[i+31:i]
		dst[i+31:i] := tmp[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm_maskz_mullo_epi32'. Requires AVX512F.
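
A scalar Go sketch of the widen-multiply-truncate sequence (helper name assumed):

	// maskzMulloEpi32 models _mm_maskz_mullo_epi32: form the 64-bit product
	// and keep only its low 32 bits in each active lane.
	func maskzMulloEpi32(k uint8, a, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				tmp := int64(a[j]) * int64(b[j])
				dst[j] = int32(tmp) // truncate to the low 32 bits
			}
		}
		return
	}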

func MaskzOrEpi32

func MaskzOrEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzOrEpi32: Compute the bitwise OR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORD'. Intrinsic: '_mm_maskz_or_epi32'. Requires AVX512F.

func MaskzOrEpi64

func MaskzOrEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzOrEpi64: Compute the bitwise OR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPORQ'. Intrinsic: '_mm_maskz_or_epi64'. Requires AVX512F.

func MaskzPermutePd

func MaskzPermutePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzPermutePd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permute_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzPermutePs

func MaskzPermutePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzPermutePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permute_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzPermutevarPd

func MaskzPermutevarPd(k x86.Mmask8, a x86.M128d, b x86.M128i) (dst x86.M128d)

MaskzPermutevarPd: Shuffle double-precision (64-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPD'. Intrinsic: '_mm_maskz_permutevar_pd'. Requires AVX512F.

func MaskzPermutevarPs

func MaskzPermutevarPs(k x86.Mmask8, a x86.M128, b x86.M128i) (dst x86.M128)

MaskzPermutevarPs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMILPS'. Intrinsic: '_mm_maskz_permutevar_ps'. Requires AVX512F.

func MaskzPermutex2varEpi32

func MaskzPermutex2varEpi32(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPermutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_maskz_permutex2var_epi32'. Requires AVX512F.
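
The selector layout (bits 1:0 of each index pick the element, bit 2 picks the source vector) is easy to miss in the pseudocode; a scalar Go sketch may help (helper name and lane representation assumed):

	// maskzPermutex2varEpi32 models _mm_maskz_permutex2var_epi32: for each
	// active lane, idx bits 1:0 select the element and idx bit 2 selects
	// between 'a' (0) and 'b' (1).
	func maskzPermutex2varEpi32(k uint8, a, idx, b [4]int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) == 0 {
				continue // zeromask: lane stays 0
			}
			off := idx[j] & 3
			if idx[j]&4 != 0 {
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return
	}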

func MaskzPermutex2varEpi64

func MaskzPermutex2varEpi64(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPermutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_maskz_permutex2var_epi64'. Requires AVX512F.

func MaskzPermutex2varPd

func MaskzPermutex2varPd(k x86.Mmask8, a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)

MaskzPermutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	IF k[j]
		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_maskz_permutex2var_pd'. Requires AVX512F.

func MaskzPermutex2varPs

func MaskzPermutex2varPs(k x86.Mmask8, a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)

MaskzPermutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	IF k[j]
		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_maskz_permutex2var_ps'. Requires AVX512F.

func MaskzRcp14Pd

func MaskzRcp14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzRcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_maskz_rcp14_pd'. Requires AVX512F.
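
A scalar Go sketch of the masked reciprocal, using an exact division as a stand-in for the hardware approximation (which is only guaranteed to within 2^-14 relative error):

	// maskzRcp14Pd approximates _mm_maskz_rcp14_pd; an exact 1/x stands in
	// for the APPROXIMATE() step of the real instruction.
	func maskzRcp14Pd(k uint8, a [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = 1.0 / a[j]
			}
		}
		return
	}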

func MaskzRcp14Ps

func MaskzRcp14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzRcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_maskz_rcp14_ps'. Requires AVX512F.

func MaskzRcp14Sd

func MaskzRcp14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzRcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_maskz_rcp14_sd'. Requires AVX512F.

func MaskzRcp14Ss

func MaskzRcp14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzRcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_maskz_rcp14_ss'. Requires AVX512F.

func MaskzRolEpi32

func MaskzRolEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_maskz_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRolEpi64

func MaskzRolEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_maskz_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRolvEpi32

func MaskzRolvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_maskz_rolv_epi32'. Requires AVX512F.
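
In Go, math/bits.RotateLeft32 already implements LEFT_ROTATE_DWORDS (including the modulo-32 count reduction), so a sketch of the masked variable rotate is short (helper name assumed):

	import "math/bits"

	// maskzRolvEpi32 models _mm_maskz_rolv_epi32 using the standard library
	// rotate; the count is reduced modulo 32 as in LEFT_ROTATE_DWORDS.
	func maskzRolvEpi32(k uint8, a, b [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = bits.RotateLeft32(a[j], int(b[j]&31))
			}
		}
		return
	}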

func MaskzRolvEpi64

func MaskzRolvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_maskz_rolv_epi64'. Requires AVX512F.

func MaskzRorEpi32

func MaskzRorEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_maskz_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRorEpi64

func MaskzRorEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzRorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_maskz_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRorvEpi32

func MaskzRorvEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_maskz_rorv_epi32'. Requires AVX512F.

func MaskzRorvEpi64

func MaskzRorvEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzRorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_maskz_rorv_epi64'. Requires AVX512F.

func MaskzRoundscalePd

func MaskzRoundscalePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_maskz_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
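
For the default control bits (imm8[2:0] == 0: round to nearest even, precision exceptions enabled), the round-to-M-fraction-bits operation reduces to scale, round, unscale. A sketch of just that path in Go (helper name assumed; SAE and the other rounding directions omitted):

	import "math"

	// maskzRoundscalePd sketches _mm_maskz_roundscale_pd for rounding
	// direction 0: round each active lane to M = imm8>>4 fraction bits.
	func maskzRoundscalePd(k uint8, a [2]float64, imm8 byte) (dst [2]float64) {
		scale := math.Ldexp(1, int(imm8>>4)) // 2^M
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.RoundToEven(a[j]*scale) / scale
			}
		}
		return
	}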

func MaskzRoundscalePs

func MaskzRoundscalePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzRoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_maskz_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleRoundSd

func MaskzRoundscaleRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzRoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleRoundSs

func MaskzRoundscaleRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzRoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleSd

func MaskzRoundscaleSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_maskz_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRoundscaleSs

func MaskzRoundscaleSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_maskz_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzRsqrt14Pd

func MaskzRsqrt14Pd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzRsqrt14Pd: Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PD'. Intrinsic: '_mm_maskz_rsqrt14_pd'. Requires AVX512F.

func MaskzRsqrt14Ps

func MaskzRsqrt14Ps(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzRsqrt14Ps: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRSQRT14PS'. Intrinsic: '_mm_maskz_rsqrt14_ps'. Requires AVX512F.

func MaskzRsqrt14Sd

func MaskzRsqrt14Sd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzRsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_maskz_rsqrt14_sd'. Requires AVX512F.

func MaskzRsqrt14Ss

func MaskzRsqrt14Ss(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzRsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

IF k[0]
	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_maskz_rsqrt14_ss'. Requires AVX512F.

func MaskzScalefPd

func MaskzScalefPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_maskz_scalef_pd'. Requires AVX512F.
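
Setting aside the NaN, infinity and denormal branches of SCALE, the core operation is a[j] * 2^FLOOR(b[j]), which math.Ldexp expresses directly. A sketch of that finite path only (helper name assumed):

	import "math"

	// maskzScalefPd models the finite-input path of _mm_maskz_scalef_pd;
	// the special-case handling in SCALE above is deliberately omitted.
	func maskzScalefPd(k uint8, a, b [2]float64) (dst [2]float64) {
		for j := 0; j < 2; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = math.Ldexp(a[j], int(math.Floor(b[j])))
			}
		}
		return
	}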

func MaskzScalefPs

func MaskzScalefPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_maskz_scalef_ps'. Requires AVX512F.

func MaskzScalefRoundSd

func MaskzScalefRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		IF k[0]
			dst[63:0] := SCALE(a[63:0], b[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_round_sd'. Requires AVX512F.

func MaskzScalefRoundSs

func MaskzScalefRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		IF k[0]
			dst[31:0] := SCALE(a[31:0], b[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_round_ss'. Requires AVX512F.

func MaskzScalefSd

func MaskzScalefSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

IF k[0]
	dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_maskz_scalef_sd'. Requires AVX512F.

func MaskzScalefSs

func MaskzScalefSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

IF k[0]
	dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_maskz_scalef_ss'. Requires AVX512F.

func MaskzSet1Epi32

func MaskzSet1Epi32(k x86.Mmask8, a int) (dst x86.M128i)

MaskzSet1Epi32: Broadcast 32-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[31:0]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_maskz_set1_epi32'. Requires AVX512F.
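
A scalar Go sketch of the masked broadcast (helper name assumed):

	// maskzSet1Epi32 models _mm_maskz_set1_epi32: broadcast 'a' into every
	// active lane and zero the rest.
	func maskzSet1Epi32(k uint8, a int32) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<uint(j)) != 0 {
				dst[j] = a
			}
		}
		return
	}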

func MaskzSet1Epi64

func MaskzSet1Epi64(k x86.Mmask8, a int64) (dst x86.M128i)

MaskzSet1Epi64: Broadcast 64-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_maskz_set1_epi64'. Requires AVX512F.

func MaskzShuffleEpi32

func MaskzShuffleEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzShuffleEpi32: Shuffle 32-bit integers in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm_maskz_shuffle_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
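
Each lane consumes two bits of 'imm8' (lane j uses imm8 bits 2j+1:2j), which a scalar Go sketch makes explicit (helper name assumed):

	// maskzShuffleEpi32 models _mm_maskz_shuffle_epi32: SELECT4 per lane,
	// then the zeromask.
	func maskzShuffleEpi32(k uint8, a [4]int32, imm8 byte) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			sel := (imm8 >> (2 * uint(j))) & 3 // two control bits per lane
			if k&(1<<uint(j)) != 0 {
				dst[j] = a[sel]
			}
		}
		return
	}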

func MaskzShufflePd

func MaskzShufflePd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzShufflePd: Shuffle double-precision (64-bit) floating-point elements using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPD'. Intrinsic: '_mm_maskz_shuffle_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzShufflePs

func MaskzShufflePs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' using the control in 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSHUFPS'. Intrinsic: '_mm_maskz_shuffle_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSllEpi32

func MaskzSllEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_sll_epi32'. Requires AVX512F.

func MaskzSllEpi64

func MaskzSllEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_sll_epi64'. Requires AVX512F.

func MaskzSlliEpi32

func MaskzSlliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm_maskz_slli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
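
The imm8[7:0] > 31 branch above means oversized immediates zero the lane rather than being reduced modulo 32. A hypothetical scalar model:

	// maskzSlliEpi32 models _mm_maskz_slli_epi32; shift counts above 31
	// clear the lane, matching the pseudocode's explicit check.
	func maskzSlliEpi32(k uint8, a [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 && imm8 <= 31 {
				dst[j] = a[j] << imm8
			}
		}
		return dst
	}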

func MaskzSlliEpi64

func MaskzSlliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm_maskz_slli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSllvEpi32

func MaskzSllvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm_maskz_sllv_epi32'. Requires AVX512F.

func MaskzSllvEpi64

func MaskzSllvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm_maskz_sllv_epi64'. Requires AVX512F.

func MaskzSqrtPd

func MaskzSqrtPd(k x86.Mmask8, a x86.M128d) (dst x86.M128d)

MaskzSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SQRT(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPD'. Intrinsic: '_mm_maskz_sqrt_pd'. Requires AVX512F.

func MaskzSqrtPs

func MaskzSqrtPs(k x86.Mmask8, a x86.M128) (dst x86.M128)

MaskzSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SQRT(a[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSQRTPS'. Intrinsic: '_mm_maskz_sqrt_ps'. Requires AVX512F.

func MaskzSqrtRoundSd

func MaskzSqrtRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzSqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := SQRT(a[63:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_round_sd'. Requires AVX512F.
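
The 'rounding' argument in these entries is an int built from the C-level _MM_FROUND_* flags. As a hedged illustration (constant values taken from the C headers; this package does not necessarily export equivalents), the combinations listed above can be written in Go as:

	// Hypothetical Go transcription of the C rounding-control flags.
	const (
		FroundToNearestInt = 0x00 // _MM_FROUND_TO_NEAREST_INT
		FroundToNegInf     = 0x01 // _MM_FROUND_TO_NEG_INF
		FroundToPosInf     = 0x02 // _MM_FROUND_TO_POS_INF
		FroundToZero       = 0x03 // _MM_FROUND_TO_ZERO
		FroundCurDirection = 0x04 // _MM_FROUND_CUR_DIRECTION (use MXCSR.RC)
		FroundNoExc        = 0x08 // _MM_FROUND_NO_EXC (suppress exceptions)
	)

	// "Truncate, and suppress exceptions", as in the list above.
	const truncateNoExc = FroundToZero | FroundNoExc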

func MaskzSqrtRoundSs

func MaskzSqrtRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzSqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := SQRT(a[31:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_round_ss'. Requires AVX512F.

func MaskzSqrtSd

func MaskzSqrtSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSqrtSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := SQRT(a[63:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_maskz_sqrt_sd'. Requires AVX512F.

func MaskzSqrtSs

func MaskzSqrtSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := SQRT(a[31:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_maskz_sqrt_ss'. Requires AVX512F.

func MaskzSraEpi32

func MaskzSraEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_sra_epi32'. Requires AVX512F.
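
Note that 'count' is read as one 64-bit amount applied to every lane, and counts above 31 fill the lane with copies of the sign bit. A hypothetical scalar model:

	// maskzSraEpi32 models _mm_maskz_sra_epi32; a shift by 31 replicates
	// the sign bit across the lane, matching the SignBit branch.
	func maskzSraEpi32(k uint8, a [4]int32, count uint64) (dst [4]int32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue // zeromask: lane stays 0
			}
			if count > 31 {
				dst[j] = a[j] >> 31 // all sign bits
			} else {
				dst[j] = a[j] >> count
			}
		}
		return dst
	}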

func MaskzSraEpi64

func MaskzSraEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_sra_epi64'. Requires AVX512F.

func MaskzSraiEpi32

func MaskzSraiEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := SignBit
		ELSE
			dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm_maskz_srai_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSraiEpi64

func MaskzSraiEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := SignBit
		ELSE
			dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_maskz_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSravEpi32

func MaskzSravEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm_maskz_srav_epi32'. Requires AVX512F.

func MaskzSravEpi64

func MaskzSravEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_maskz_srav_epi64'. Requires AVX512F.

func MaskzSrlEpi32

func MaskzSrlEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF count[63:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srl_epi32'. Requires AVX512F.

func MaskzSrlEpi64

func MaskzSrlEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF count[63:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srl_epi64'. Requires AVX512F.

func MaskzSrliEpi32

func MaskzSrliEpi32(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		IF imm8[7:0] > 31
			dst[i+31:i] := 0
		ELSE
			dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm_maskz_srli_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSrliEpi64

func MaskzSrliEpi64(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		IF imm8[7:0] > 63
			dst[i+63:i] := 0
		ELSE
			dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm_maskz_srli_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzSrlvEpi32

func MaskzSrlvEpi32(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm_maskz_srlv_epi32'. Requires AVX512F.
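
Unlike the single-count forms, each lane here has its own count, and oversized counts simply produce 0. Go's shift semantics already match that, so a hypothetical model needs no special case:

	// maskzSrlvEpi32 models _mm_maskz_srlv_epi32: a per-lane logical
	// right shift; Go defines uint32 shifts by 32 or more as 0.
	func maskzSrlvEpi32(k uint8, a, count [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = a[j] >> count[j]
			}
		}
		return dst
	}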

func MaskzSrlvEpi64

func MaskzSrlvEpi64(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm_maskz_srlv_epi64'. Requires AVX512F.

func MaskzSubEpi32

func MaskzSubEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm_maskz_sub_epi32'. Requires AVX512F.

func MaskzSubEpi64

func MaskzSubEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm_maskz_sub_epi64'. Requires AVX512F.

func MaskzSubPd

func MaskzSubPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSubPd: Subtract packed double-precision (64-bit) floating-point elements in 'b' from packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPD'. Intrinsic: '_mm_maskz_sub_pd'. Requires AVX512F.

func MaskzSubPs

func MaskzSubPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSUBPS'. Intrinsic: '_mm_maskz_sub_ps'. Requires AVX512F.

func MaskzSubRoundSd

func MaskzSubRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MaskzSubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[63:0] := a[63:0] - b[63:0]
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_round_sd'. Requires AVX512F.

func MaskzSubRoundSs

func MaskzSubRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MaskzSubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		IF k[0]
			dst[31:0] := a[31:0] - b[31:0]
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_round_ss'. Requires AVX512F.

func MaskzSubSd

func MaskzSubSd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzSubSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

IF k[0]
	dst[63:0] := a[63:0] - b[63:0]
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_maskz_sub_sd'. Requires AVX512F.

func MaskzSubSs

func MaskzSubSs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzSubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

IF k[0]
	dst[31:0] := a[31:0] - b[31:0]
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_maskz_sub_ss'. Requires AVX512F.

func MaskzTernarylogicEpi32

func MaskzTernarylogicEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

MaskzTernarylogicEpi32: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		FOR h := 0 to 31
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_maskz_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
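
The imm8 lookup is easier to see expanded into minterms. Below is a hedged pure-Go model (hypothetical helper; lanes as uint32):

	// maskzTernarylogicEpi32 models _mm_maskz_ternarylogic_epi32. For each
	// set bit idx of imm8, it ORs in the positions where the (a, b, c)
	// bits equal (idx[2], idx[1], idx[0]).
	func maskzTernarylogicEpi32(k uint8, a, b, c [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			if k&(1<<j) == 0 {
				continue
			}
			var r uint32
			for idx := 0; idx < 8; idx++ {
				if imm8&(1<<idx) == 0 {
					continue // this minterm contributes nothing
				}
				m := uint32(0xFFFFFFFF)
				if idx&4 != 0 {
					m &= a[j]
				} else {
					m &^= a[j]
				}
				if idx&2 != 0 {
					m &= b[j]
				} else {
					m &^= b[j]
				}
				if idx&1 != 0 {
					m &= c[j]
				} else {
					m &^= c[j]
				}
				r |= m
			}
			dst[j] = r
		}
		return dst
	}

Two well-known control bytes: imm8 = 0x96 computes the three-way XOR a^b^c, and imm8 = 0xE8 computes the bitwise majority of 'a', 'b', and 'c'.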

func MaskzTernarylogicEpi64

func MaskzTernarylogicEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

MaskzTernarylogicEpi64: Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' are used to form a 3-bit index into 'imm8', and the value at that bit in 'imm8' is written to the corresponding bit in 'dst' using zeromask 'k' at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		FOR h := 0 to 63
			index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
			dst[i+h] := imm8[index[2:0]]
		ENDFOR
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_maskz_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func MaskzUnpackhiEpi32

func MaskzUnpackhiEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm_maskz_unpackhi_epi32'. Requires AVX512F.
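
A hypothetical scalar model of the interleave (lanes as uint32) makes the lane mapping explicit:

	// maskzUnpackhiEpi32 models _mm_maskz_unpackhi_epi32: interleave the
	// two high lanes of 'a' and 'b', then apply the zeromask.
	func maskzUnpackhiEpi32(k uint8, a, b [4]uint32) (dst [4]uint32) {
		tmp := [4]uint32{a[2], b[2], a[3], b[3]} // INTERLEAVE_HIGH_DWORDS
		for j := 0; j < 4; j++ {
			if k&(1<<j) != 0 {
				dst[j] = tmp[j]
			}
		}
		return dst
	}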

func MaskzUnpackhiEpi64

func MaskzUnpackhiEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm_maskz_unpackhi_epi64'. Requires AVX512F.

func MaskzUnpackhiPd

func MaskzUnpackhiPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzUnpackhiPd: Unpack and interleave double-precision (64-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPD'. Intrinsic: '_mm_maskz_unpackhi_pd'. Requires AVX512F.

func MaskzUnpackhiPs

func MaskzUnpackhiPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzUnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKHPS'. Intrinsic: '_mm_maskz_unpackhi_ps'. Requires AVX512F.

func MaskzUnpackloEpi32

func MaskzUnpackloEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi32: Unpack and interleave 32-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm_maskz_unpacklo_epi32'. Requires AVX512F.

func MaskzUnpackloEpi64

func MaskzUnpackloEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi64: Unpack and interleave 64-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm_maskz_unpacklo_epi64'. Requires AVX512F.

func MaskzUnpackloPd

func MaskzUnpackloPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzUnpackloPd: Unpack and interleave double-precision (64-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp_dst[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPD'. Intrinsic: '_mm_maskz_unpacklo_pd'. Requires AVX512F.

func MaskzUnpackloPs

func MaskzUnpackloPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzUnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp_dst[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VUNPCKLPS'. Intrinsic: '_mm_maskz_unpacklo_ps'. Requires AVX512F.

func MaskzXorEpi32

func MaskzXorEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzXorEpi32: Compute the bitwise XOR of packed 32-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORD'. Intrinsic: '_mm_maskz_xor_epi32'. Requires AVX512F.

func MaskzXorEpi64

func MaskzXorEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzXorEpi64: Compute the bitwise XOR of packed 64-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPXORQ'. Intrinsic: '_mm_maskz_xor_epi64'. Requires AVX512F.

func MaxEpi64

func MaxEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MaxEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSQ'. Intrinsic: '_mm_max_epi64'. Requires AVX512F.

func MaxEpu64

func MaxEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MaxEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] > b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUQ'. Intrinsic: '_mm_max_epu64'. Requires AVX512F.
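
MaxEpi64 and MaxEpu64 differ only in whether the lane comparison is signed, and the same bit pattern can win one comparison and lose the other. A hypothetical scalar illustration:

	// max64 contrasts the VPMAXSQ and VPMAXUQ lane comparisons: the
	// all-ones pattern is -1 as int64 but 2^64-1 as uint64.
	func max64(a, b uint64) (signed int64, unsigned uint64) {
		signed, unsigned = int64(b), b
		if int64(a) > int64(b) {
			signed = int64(a)
		}
		if a > b {
			unsigned = a
		}
		return
	}

For example, max64(0xFFFFFFFFFFFFFFFF, 1) returns (1, 0xFFFFFFFFFFFFFFFF).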

func MaxRoundSd

func MaxRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MaxRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[63:0] := MAX(a[63:0], b[63:0])
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMAXSD'. Intrinsic: '_mm_max_round_sd'. Requires AVX512F.

func MaxRoundSs

func MaxRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)

MaxRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[31:0] := MAX(a[31:0], b[31:0])
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMAXSS'. Intrinsic: '_mm_max_round_ss'. Requires AVX512F.

func MinEpi64

func MinEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MinEpi64: Compare packed 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSQ'. Intrinsic: '_mm_min_epi64'. Requires AVX512F.

func MinEpu64

func MinEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MinEpu64: Compare packed unsigned 64-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63:i] < b[i+63:i]
		dst[i+63:i] := a[i+63:i]
	ELSE
		dst[i+63:i] := b[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUQ'. Intrinsic: '_mm_min_epu64'. Requires AVX512F.

func MinRoundSd

func MinRoundSd(a x86.M128d, b x86.M128d, sae int) (dst x86.M128d)

MinRoundSd: Compare the lower double-precision (64-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[63:0] := MIN(a[63:0], b[63:0])
	dst[127:64] := a[127:64]
	dst[MAX:128] := 0

Instruction: 'VMINSD'. Intrinsic: '_mm_min_round_sd'. Requires AVX512F.

func MinRoundSs

func MinRoundSs(a x86.M128, b x86.M128, sae int) (dst x86.M128)

MinRoundSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	dst[31:0] := MIN(a[31:0], b[31:0])
	dst[127:32] := a[127:32]
	dst[MAX:128] := 0

Instruction: 'VMINSS'. Intrinsic: '_mm_min_round_ss'. Requires AVX512F.

func MulRoundSd

func MulRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

MulRoundSd: Multiply the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] * b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VMULSD'. Intrinsic: '_mm_mul_round_sd'. Requires AVX512F.

func MulRoundSs

func MulRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

MulRoundSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] * b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VMULSS'. Intrinsic: '_mm_mul_round_ss'. Requires AVX512F.

func Permutex2varEpi32

func Permutex2varEpi32(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

Permutex2varEpi32: Shuffle 32-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2D, VPERMT2D'. Intrinsic: '_mm_permutex2var_epi32'. Requires AVX512F.
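
The index encoding packs a lane number and a source selector into each 'idx' element. A hedged pure-Go model (hypothetical helper):

	// permutex2varEpi32 models _mm_permutex2var_epi32: bits 1:0 of each
	// index pick a source lane, and bit 2 picks between 'a' and 'b'.
	func permutex2varEpi32(a, idx, b [4]uint32) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			off := idx[j] & 3  // idx[i+1:i]: source lane
			if idx[j]&4 != 0 { // idx[i+2]: 1 selects 'b'
				dst[j] = b[off]
			} else {
				dst[j] = a[off]
			}
		}
		return dst
	}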

func Permutex2varEpi64

func Permutex2varEpi64(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

Permutex2varEpi64: Shuffle 64-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2Q, VPERMT2Q'. Intrinsic: '_mm_permutex2var_epi64'. Requires AVX512F.

func Permutex2varPd

func Permutex2varPd(a x86.M128d, idx x86.M128i, b x86.M128d) (dst x86.M128d)

Permutex2varPd: Shuffle double-precision (64-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	off := idx[i]*64
	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PD, VPERMT2PD'. Intrinsic: '_mm_permutex2var_pd'. Requires AVX512F.

func Permutex2varPs

func Permutex2varPs(a x86.M128, idx x86.M128i, b x86.M128) (dst x86.M128)

Permutex2varPs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	off := idx[i+1:i]*32
	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2PS, VPERMT2PS'. Intrinsic: '_mm_permutex2var_ps'. Requires AVX512F.

func Rcp14Pd

func Rcp14Pd(a x86.M128d) (dst x86.M128d)

Rcp14Pd: Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PD'. Intrinsic: '_mm_rcp14_pd'. Requires AVX512F.

func Rcp14Ps

func Rcp14Ps(a x86.M128) (dst x86.M128)

Rcp14Ps: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 2^-14.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRCP14PS'. Intrinsic: '_mm_rcp14_ps'. Requires AVX512F.

func Rcp14Sd

func Rcp14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Rcp14Sd: Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[63:0] := APPROXIMATE(1.0/b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRCP14SD'. Intrinsic: '_mm_rcp14_sd'. Requires AVX512F.

func Rcp14Ss

func Rcp14Ss(a x86.M128, b x86.M128) (dst x86.M128)

Rcp14Ss: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[31:0] := APPROXIMATE(1.0/b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRCP14SS'. Intrinsic: '_mm_rcp14_ss'. Requires AVX512F.

func RolEpi32

func RolEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)

RolEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLD'. Intrinsic: '_mm_rol_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
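
In plain Go the same rotation is available directly; math/bits reduces the count modulo 32 just like LEFT_ROTATE_DWORDS. A hypothetical model:

	import "math/bits"

	// rolEpi32 models _mm_rol_epi32 using the standard library.
	func rolEpi32(a [4]uint32, imm8 byte) (dst [4]uint32) {
		for j := 0; j < 4; j++ {
			dst[j] = bits.RotateLeft32(a[j], int(imm8))
		}
		return dst
	}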

func RolEpi64

func RolEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

RolEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in 'imm8', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLQ'. Intrinsic: '_mm_rol_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RolvEpi32

func RolvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RolvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVD'. Intrinsic: '_mm_rolv_epi32'. Requires AVX512F.

func RolvEpi64

func RolvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RolvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the left by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

LEFT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPROLVQ'. Intrinsic: '_mm_rolv_epi64'. Requires AVX512F.

func RorEpi32

func RorEpi32(a x86.M128i, imm8 byte) (dst x86.M128i)

RorEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORD'. Intrinsic: '_mm_ror_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RorEpi64

func RorEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

RorEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in 'imm8', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORQ'. Intrinsic: '_mm_ror_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RorvEpi32

func RorvEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RorvEpi32: Rotate the bits in each packed 32-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_DWORDS(src, count_src){
	count := count_src modulo 32
	RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVD'. Intrinsic: '_mm_rorv_epi32'. Requires AVX512F.

func RorvEpi64

func RorvEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RorvEpi64: Rotate the bits in each packed 64-bit integer in 'a' to the right by the number of bits specified in the corresponding element of 'b', and store the results in 'dst'.

RIGHT_ROTATE_QWORDS(src, count_src){
	count := count_src modulo 64
	RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPRORVQ'. Intrinsic: '_mm_rorv_epi64'. Requires AVX512F.

func RoundscalePd

func RoundscalePd(a x86.M128d, imm8 byte) (dst x86.M128d)

RoundscalePd: Round packed double-precision (64-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPD'. Intrinsic: '_mm_roundscale_pd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
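
Stripped of the MXCSR path (imm8[2]) and the precision-exception bookkeeping (imm8[3]), RoundTo_IntegerPD is "scale up by 2^M, round, scale back down". A hedged single-element model in Go:

	import "math"

	// roundscale models RoundTo_IntegerPD for one element: keep M
	// fraction bits, rounding in the direction chosen by imm8[1:0].
	func roundscale(src float64, imm8 byte) float64 {
		m := int(imm8 >> 4)     // imm8[7:4]: fraction bits to keep
		x := math.Ldexp(src, m) // 2^M * src
		var tmp float64
		switch imm8 & 3 { // imm8[1:0]: rounding direction
		case 0:
			tmp = math.RoundToEven(x)
		case 1:
			tmp = math.Floor(x)
		case 2:
			tmp = math.Ceil(x)
		case 3:
			tmp = math.Trunc(x)
		}
		return math.Ldexp(tmp, -m) // 2^-M * tmp
	}

For example, roundscale(1.3, 0x10) (M = 1, round to nearest) returns 1.5, the nearest multiple of 2^-1.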

func RoundscalePs

func RoundscalePs(a x86.M128, imm8 byte) (dst x86.M128)

RoundscalePs: Round packed single-precision (32-bit) floating-point elements in 'a' to the number of fraction bits specified by 'imm8', and store the results in 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRNDSCALEPS'. Intrinsic: '_mm_roundscale_ps'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleRoundSd

func RoundscaleRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

RoundscaleRoundSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPD(src[63:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
			1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
			2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
			3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
			ESAC

			dst[63:0] := 2^-M * tmp[63:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[63:0] != dst[63:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[63:0]
		}

		dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_round_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleRoundSs

func RoundscaleRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

RoundscaleRoundSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RoundTo_IntegerPS(src[31:0], imm8[7:0]){
			IF(imm8[2] == 1)
				rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
			ELSE
				rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
			FI

			M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

			CASE(rounding_direction)
			0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
			1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
			2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
			3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
			ESAC

			dst[31:0] := 2^-M * tmp[31:0] // scale back down

			IF imm8[3] == 0 //check SPE
				IF src[31:0] != dst[31:0] //check if precision has been lost
					set_precision() //set #PE
				FI
			FI
			RETURN dst[31:0]
		}

		dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_round_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleSd

func RoundscaleSd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

RoundscaleSd: Round the lower double-precision (64-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

RoundTo_IntegerPD(src[63:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
	ESAC

	dst[63:0] := 2^-M * tmp[63:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[63:0] != dst[63:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[63:0]
}

dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESD'. Intrinsic: '_mm_roundscale_sd'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func RoundscaleSs

func RoundscaleSs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

RoundscaleSs: Round the lower single-precision (32-bit) floating-point element in 'a' to the number of fraction bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

RoundTo_IntegerPS(src[31:0], imm8[7:0]){
	IF(imm8[2] == 1)
		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
	ELSE
		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
	FI

	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)

	CASE(rounding_direction)
	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
	ESAC

	dst[31:0] := 2^-M * tmp[31:0] // scale back down

	IF imm8[3] == 0 //check SPE
		IF src[31:0] != dst[31:0] //check if precision has been lost
			set_precision() //set #PE
		FI
	FI
	RETURN dst[31:0]
}

dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VRNDSCALESS'. Intrinsic: '_mm_roundscale_ss'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func Rsqrt14Sd

func Rsqrt14Sd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Rsqrt14Sd: Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SD'. Intrinsic: '_mm_rsqrt14_sd'. Requires AVX512F.

func Rsqrt14Ss

func Rsqrt14Ss(a x86.M128, b x86.M128) (dst x86.M128)

Rsqrt14Ss: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 2^-14.

dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRSQRT14SS'. Intrinsic: '_mm_rsqrt14_ss'. Requires AVX512F.

func ScalefPd

func ScalefPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

ScalefPd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPD'. Intrinsic: '_mm_scalef_pd'. Requires AVX512F.
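
Ignoring the NaN/infinity/denormal arms above, SCALE reduces to multiplying by two raised to the floor of the second operand. A hypothetical one-element model:

	import "math"

	// scalef models the finite-input arm of SCALE:
	// src1 * POW(2, FLOOR(src2)).
	func scalef(src1, src2 float64) float64 {
		return src1 * math.Exp2(math.Floor(src2))
	}

For example, scalef(3, 2.7) returns 12 (3 * 2^2).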

func ScalefPs

func ScalefPs(a x86.M128, b x86.M128) (dst x86.M128)

ScalefPs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', and store the results in 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VSCALEFPS'. Intrinsic: '_mm_scalef_ps'. Requires AVX512F.

func ScalefRoundSd

func ScalefRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

ScalefRoundSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
			RETURN dst[63:0]
		}

		dst[63:0] := SCALE(a[63:0], b[63:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_round_sd'. Requires AVX512F.
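
The *Round* functions here take 'rounding' as a plain int mirroring Intel's _MM_FROUND_* flags. This package does not export those constants, so a caller would restate the standard <immintrin.h> values; a sketch under that assumption:

	// Standard <immintrin.h> values, restated because this demo package
	// does not define them.
	const (
		MM_FROUND_TO_NEAREST_INT = 0x00
		MM_FROUND_TO_NEG_INF     = 0x01
		MM_FROUND_TO_POS_INF     = 0x02
		MM_FROUND_TO_ZERO        = 0x03
		MM_FROUND_CUR_DIRECTION  = 0x04
		MM_FROUND_NO_EXC         = 0x08
	)

	// Truncate and suppress exceptions:
	// dst := ScalefRoundSd(a, b, MM_FROUND_TO_ZERO|MM_FROUND_NO_EXC)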

func ScalefRoundSs

func ScalefRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

ScalefRoundSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		SCALE(src1, src2){
			IF (src2 == NaN)
				IF (src2 == SNaN)
					RETURN QNAN(src2)
				FI
			ELSE IF (src1 == NaN)
				IF (src1 == SNaN)
					RETURN QNAN(src1)
				FI
				IF (src2 != INF)
					RETURN QNAN(src1)
				FI
			ELSE
				tmp_src2 := src2
				tmp_src1 := src1
				IF (src2 is denormal AND MXCSR.DAZ)
					tmp_src2 := 0
				FI
				IF (src1 is denormal AND MXCSR.DAZ)
					tmp_src1 := 0
				FI
			FI
			dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
			RETURN dst[31:0]
		}

		dst[31:0] := SCALE(a[31:0], b[31:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_round_ss'. Requires AVX512F.

func ScalefSd

func ScalefSd(a x86.M128d, b x86.M128d) (dst x86.M128d)

ScalefSd: Scale the packed double-precision (64-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
	RETURN dst[63:0]
}

dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VSCALEFSD'. Intrinsic: '_mm_scalef_sd'. Requires AVX512F.

func ScalefSs

func ScalefSs(a x86.M128, b x86.M128) (dst x86.M128)

ScalefSs: Scale the packed single-precision (32-bit) floating-point elements in 'a' using values from 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

SCALE(src1, src2){
	IF (src2 == NaN)
		IF (src2 == SNaN)
			RETURN QNAN(src2)
		FI
	ELSE IF (src1 == NaN)
		IF (src1 == SNaN)
			RETURN QNAN(src1)
		FI
		IF (src2 != INF)
			RETURN QNAN(src1)
		FI
	ELSE
		tmp_src2 := src2
		tmp_src1 := src1
		IF (src2 is denormal AND MXCSR.DAZ)
			tmp_src2 := 0
		FI
		IF (src1 is denormal AND MXCSR.DAZ)
			tmp_src1 := 0
		FI
	FI
	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
	RETURN dst[31:0]
}

dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VSCALEFSS'. Intrinsic: '_mm_scalef_ss'. Requires AVX512F.

func SqrtRoundSd

func SqrtRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

SqrtRoundSd: Compute the square root of the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := SQRT(a[63:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VSQRTSD'. Intrinsic: '_mm_sqrt_round_sd'. Requires AVX512F.

func SqrtRoundSs

func SqrtRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

SqrtRoundSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := SQRT(a[31:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VSQRTSS'. Intrinsic: '_mm_sqrt_round_ss'. Requires AVX512F.

func SraEpi64

func SraEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SraEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_sra_epi64'. Requires AVX512F.
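
In Go, >> on a signed integer is already an arithmetic shift, so one lane of VPSRAQ can be modeled directly; the same helper also describes SraiEpi64 and, per element, SravEpi64 below. A sketch with illustrative names, not part of this package:

	package main

	import "fmt"

	// sraLane models one 64-bit lane: shift right, filling with the sign
	// bit, with counts above 63 saturating to all-sign-bits as in the
	// pseudocode above.
	func sraLane(a int64, count uint64) int64 {
		if count > 63 {
			count = 63 // a >> 63 is 0 or -1, i.e. the replicated sign bit
		}
		return a >> count
	}

	func main() {
		fmt.Println(sraLane(-8, 1))   // -4: sign bit shifted in
		fmt.Println(sraLane(-8, 100)) // -1: all sign bits
		fmt.Println(sraLane(8, 100))  //  0
	}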

func SraiEpi64

func SraiEpi64(a x86.M128i, imm8 byte) (dst x86.M128i)

SraiEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := SignBit
	ELSE
		dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAQ'. Intrinsic: '_mm_srai_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func SravEpi64

func SravEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SravEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVQ'. Intrinsic: '_mm_srav_epi64'. Requires AVX512F.

func SubRoundSd

func SubRoundSd(a x86.M128d, b x86.M128d, rounding int) (dst x86.M128d)

SubRoundSd: Subtract the lower double-precision (64-bit) floating-point element in 'b' from the lower double-precision (64-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[63:0] := a[63:0] - b[63:0]
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VSUBSD'. Intrinsic: '_mm_sub_round_sd'. Requires AVX512F.

func SubRoundSs

func SubRoundSs(a x86.M128, b x86.M128, rounding int) (dst x86.M128)

SubRoundSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		dst[31:0] := a[31:0] - b[31:0]
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VSUBSS'. Intrinsic: '_mm_sub_round_ss'. Requires AVX512F.

func TernarylogicEpi32

func TernarylogicEpi32(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

TernarylogicEpi32: Bitwise ternary logic that can implement any three-operand boolean function; the specific function is selected by the value in 'imm8'. For each bit in each packed 32-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the value of 'imm8' at that index is written to the corresponding bit in 'dst'.

FOR j := 0 to 3
	i := j*32
	FOR h := 0 to 31
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGD'. Intrinsic: '_mm_ternarylogic_epi32'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)
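
The immediate is simply the truth table of the desired three-input boolean function, indexed by a<<2 | b<<1 | c, and the same scheme applies to TernarylogicEpi64 below. A hedged pure-Go sketch of deriving an imm8 value and applying it to one 32-bit lane (helper names are illustrative):

	package main

	import "fmt"

	// ternlogImm derives the imm8 lookup table for a three-input boolean
	// function f, using the indexing from the pseudocode above:
	// index = a<<2 | b<<1 | c, and imm8[index] is the output bit.
	func ternlogImm(f func(a, b, c uint8) uint8) uint8 {
		var imm uint8
		for idx := uint8(0); idx < 8; idx++ {
			a, b, c := idx>>2&1, idx>>1&1, idx&1
			imm |= f(a, b, c) << idx
		}
		return imm
	}

	// ternlog32 models one 32-bit lane of VPTERNLOGD in pure Go.
	func ternlog32(a, b, c uint32, imm8 uint8) uint32 {
		var dst uint32
		for h := 0; h < 32; h++ {
			idx := (a>>h&1)<<2 | (b>>h&1)<<1 | c>>h&1
			dst |= uint32(imm8>>idx&1) << h
		}
		return dst
	}

	func main() {
		// (a AND b) OR c has the classic table value 0xEA.
		imm := ternlogImm(func(a, b, c uint8) uint8 { return a&b | c })
		fmt.Printf("imm8 = %#x\n", imm) // 0xea
		fmt.Printf("%#x\n", ternlog32(0xF0F0F0F0, 0xFF00FF00, 0x0F0F0F0F, imm)) // 0xff0fff0f
	}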

func TernarylogicEpi64

func TernarylogicEpi64(a x86.M128i, b x86.M128i, c x86.M128i, imm8 byte) (dst x86.M128i)

TernarylogicEpi64: Bitwise ternary logic that can implement any three-operand boolean function; the specific function is selected by the value in 'imm8'. For each bit in each packed 64-bit integer, the corresponding bits from 'a', 'b', and 'c' form a 3-bit index into 'imm8', and the value of 'imm8' at that index is written to the corresponding bit in 'dst'.

FOR j := 0 to 1
	i := j*64
	FOR h := 0 to 63
		index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
		dst[i+h] := imm8[index[2:0]]
	ENDFOR
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPTERNLOGQ'. Intrinsic: '_mm_ternarylogic_epi64'. Requires AVX512F.

FIXME: Requires compiler support (has immediate)

func TestEpi32Mask

func TestEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestEpi32Mask: Compute the bitwise AND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 3
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTMD'. Intrinsic: '_mm_test_epi32_mask'. Requires AVX512F.

func TestEpi64Mask

func TestEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestEpi64Mask: Compute the bitwise AND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 1
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTMQ'. Intrinsic: '_mm_test_epi64_mask'. Requires AVX512F.

func TestnEpi32Mask

func TestnEpi32Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestnEpi32Mask: Compute the bitwise NAND of packed 32-bit integers in 'a' and 'b', producing intermediate 32-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 3
	i := j*32
	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0

Instruction: 'VPTESTNMD'. Intrinsic: '_mm_testn_epi32_mask'. Requires AVX512F.

func TestnEpi64Mask

func TestnEpi64Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestnEpi64Mask: Compute the bitwise NAND of packed 64-bit integers in 'a' and 'b', producing intermediate 64-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 1
	i := j*64
	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0

Instruction: 'VPTESTNMQ'. Intrinsic: '_mm_testn_epi64_mask'. Requires AVX512F.
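
Test* and Testn* produce complementary masks over the populated lanes (for the 32-bit variants only mask bits 0-3 are ever set, per k[MAX:4] := 0). A minimal pure-Go model of the 32-bit pair, with plain arrays standing in for the vector registers:

	package main

	import "fmt"

	// testEpi32Mask sets mask bit j when a[j] AND b[j] is non-zero,
	// mirroring VPTESTMD over four lanes.
	func testEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := 0; j < 4; j++ {
			if a[j]&b[j] != 0 {
				k |= 1 << j
			}
		}
		return k
	}

	// testnEpi32Mask sets mask bit j when a[j] AND b[j] is zero,
	// mirroring VPTESTNMD; it is the lane-wise complement of the above.
	func testnEpi32Mask(a, b [4]uint32) (k uint8) {
		for j := 0; j < 4; j++ {
			if a[j]&b[j] == 0 {
				k |= 1 << j
			}
		}
		return k
	}

	func main() {
		a := [4]uint32{1, 2, 0, 8}
		b := [4]uint32{1, 4, 0, 8}
		fmt.Printf("test:  %04b\n", testEpi32Mask(a, b))  // 1001
		fmt.Printf("testn: %04b\n", testnEpi32Mask(a, b)) // 0110
	}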

Types

This section is empty.
