avx512dq

package
v0.0.0-...-3878f85 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func BroadcastI32x2

func BroadcastI32x2(a x86.M128i) (dst x86.M128i)

BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_broadcast_i32x2'. Requires AVX512DQ.

func Cvtepi64Pd

func Cvtepi64Pd(a x86.M128i) (dst x86.M128d)

Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_cvtepi64_pd'. Requires AVX512DQ.

func Cvtepi64Ps

func Cvtepi64Ps(a x86.M128i) (dst x86.M128)

Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_cvtepi64_ps'. Requires AVX512DQ.

func Cvtepu64Pd

func Cvtepu64Pd(a x86.M128i) (dst x86.M128d)

Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_cvtepu64_pd'. Requires AVX512DQ.

func Cvtepu64Ps

func Cvtepu64Ps(a x86.M128i) (dst x86.M128)

Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_cvtepu64_ps'. Requires AVX512DQ.

func CvtpdEpi64

func CvtpdEpi64(a x86.M128d) (dst x86.M128i)

CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_cvtpd_epi64'. Requires AVX512DQ.

func CvtpdEpu64

func CvtpdEpu64(a x86.M128d) (dst x86.M128i)

CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_cvtpd_epu64'. Requires AVX512DQ.

func CvtpsEpi64

func CvtpsEpi64(a x86.M128) (dst x86.M128i)

CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_cvtps_epi64'. Requires AVX512DQ.

func CvtpsEpu64

func CvtpsEpu64(a x86.M128) (dst x86.M128i)

CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_cvtps_epu64'. Requires AVX512DQ.

func CvttpdEpi64

func CvttpdEpi64(a x86.M128d) (dst x86.M128i)

CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_cvttpd_epi64'. Requires AVX512DQ.

func CvttpdEpu64

func CvttpdEpu64(a x86.M128d) (dst x86.M128i)

CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_cvttpd_epu64'. Requires AVX512DQ.

func CvttpsEpi64

func CvttpsEpi64(a x86.M128) (dst x86.M128i)

CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_cvttps_epi64'. Requires AVX512DQ.

func CvttpsEpu64

func CvttpsEpu64(a x86.M128) (dst x86.M128i)

CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_cvttps_epu64'. Requires AVX512DQ.

func FpclassPdMask

func FpclassPdMask(a x86.M128d, imm8 byte) (dst x86.Mmask8)

FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 1
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:2] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassPsMask

func FpclassPsMask(a x86.M128, imm8 byte) (dst x86.Mmask8)

FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassSdMask

func FpclassSdMask(a x86.M128d, imm8 byte) (dst x86.Mmask8)

FpclassSdMask: Test the lower double-precision (64-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
		k[MAX:1] := 0

Instruction: 'VFPCLASSSD'. Intrinsic: '_mm_fpclass_sd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func FpclassSsMask

func FpclassSsMask(a x86.M128, imm8 byte) (dst x86.Mmask8)

FpclassSsMask: Test the lower single-precision (32-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
		k[MAX:1] := 0

Instruction: 'VFPCLASSSS'. Intrinsic: '_mm_fpclass_ss_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256BroadcastF32x2

func M256BroadcastF32x2(a x86.M128) (dst x86.M256)

M256BroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_broadcast_f32x2'. Requires AVX512DQ.

func M256BroadcastF64x2

func M256BroadcastF64x2(a x86.M128d) (dst x86.M256d)

M256BroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_broadcast_f64x2'. Requires AVX512DQ.

func M256BroadcastI32x2

func M256BroadcastI32x2(a x86.M128i) (dst x86.M256i)

M256BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_broadcast_i32x2'. Requires AVX512DQ.

func M256BroadcastI64x2

func M256BroadcastI64x2(a x86.M128i) (dst x86.M256i)

M256BroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_broadcast_i64x2'. Requires AVX512DQ.

func M256Cvtepi64Pd

func M256Cvtepi64Pd(a x86.M256i) (dst x86.M256d)

M256Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_cvtepi64_pd'. Requires AVX512DQ.

func M256Cvtepi64Ps

func M256Cvtepi64Ps(a x86.M256i) (dst x86.M128)

M256Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_cvtepi64_ps'. Requires AVX512DQ.

func M256Cvtepu64Pd

func M256Cvtepu64Pd(a x86.M256i) (dst x86.M256d)

M256Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_cvtepu64_pd'. Requires AVX512DQ.

func M256Cvtepu64Ps

func M256Cvtepu64Ps(a x86.M256i) (dst x86.M128)

M256Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_cvtepu64_ps'. Requires AVX512DQ.

func M256CvtpdEpi64

func M256CvtpdEpi64(a x86.M256d) (dst x86.M256i)

M256CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_cvtpd_epi64'. Requires AVX512DQ.

func M256CvtpdEpu64

func M256CvtpdEpu64(a x86.M256d) (dst x86.M256i)

M256CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_cvtpd_epu64'. Requires AVX512DQ.

func M256CvtpsEpi64

func M256CvtpsEpi64(a x86.M128) (dst x86.M256i)

M256CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_cvtps_epi64'. Requires AVX512DQ.

func M256CvtpsEpu64

func M256CvtpsEpu64(a x86.M128) (dst x86.M256i)

M256CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_cvtps_epu64'. Requires AVX512DQ.

func M256CvttpdEpi64

func M256CvttpdEpi64(a x86.M256d) (dst x86.M256i)

M256CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_cvttpd_epi64'. Requires AVX512DQ.

func M256CvttpdEpu64

func M256CvttpdEpu64(a x86.M256d) (dst x86.M256i)

M256CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_cvttpd_epu64'. Requires AVX512DQ.

func M256CvttpsEpi64

func M256CvttpsEpi64(a x86.M128) (dst x86.M256i)

M256CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_cvttps_epi64'. Requires AVX512DQ.

func M256CvttpsEpu64

func M256CvttpsEpu64(a x86.M128) (dst x86.M256i)

M256CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_cvttps_epu64'. Requires AVX512DQ.

func M256Extractf64x2Pd

func M256Extractf64x2Pd(a x86.M256d, imm8 byte) (dst x86.M128d)

M256Extractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Extracti64x2Epi64

func M256Extracti64x2Epi64(a x86.M256i, imm8 byte) (dst x86.M128i)

M256Extracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256FpclassPdMask

func M256FpclassPdMask(a x86.M256d, imm8 byte) (dst x86.Mmask8)

M256FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm256_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256FpclassPsMask

func M256FpclassPsMask(a x86.M256, imm8 byte) (dst x86.Mmask8)

M256FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm256_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Insertf64x2

func M256Insertf64x2(a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256Insertf64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256Inserti64x2

func M256Inserti64x2(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256Inserti64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskAndPd

func M256MaskAndPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm256_mask_and_pd'. Requires AVX512DQ.

func M256MaskAndPs

func M256MaskAndPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm256_mask_and_ps'. Requires AVX512DQ.

func M256MaskAndnotPd

func M256MaskAndnotPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm256_mask_andnot_pd'. Requires AVX512DQ.

func M256MaskAndnotPs

func M256MaskAndnotPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm256_mask_andnot_ps'. Requires AVX512DQ.

func M256MaskBroadcastF32x2

func M256MaskBroadcastF32x2(src x86.M256, k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_mask_broadcast_f32x2'. Requires AVX512DQ.

func M256MaskBroadcastF64x2

func M256MaskBroadcastF64x2(src x86.M256d, k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_mask_broadcast_f64x2'. Requires AVX512DQ.

func M256MaskBroadcastI32x2

func M256MaskBroadcastI32x2(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_mask_broadcast_i32x2'. Requires AVX512DQ.

func M256MaskBroadcastI64x2

func M256MaskBroadcastI64x2(src x86.M256i, k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_mask_broadcast_i64x2'. Requires AVX512DQ.

func M256MaskCvtepi64Pd

func M256MaskCvtepi64Pd(src x86.M256d, k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_mask_cvtepi64_pd'. Requires AVX512DQ.

func M256MaskCvtepi64Ps

func M256MaskCvtepi64Ps(src x86.M128, k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_mask_cvtepi64_ps'. Requires AVX512DQ.

func M256MaskCvtepu64Pd

func M256MaskCvtepu64Pd(src x86.M256d, k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_mask_cvtepu64_pd'. Requires AVX512DQ.

func M256MaskCvtepu64Ps

func M256MaskCvtepu64Ps(src x86.M128, k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_mask_cvtepu64_ps'. Requires AVX512DQ.

func M256MaskCvtpdEpi64

func M256MaskCvtpdEpi64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_mask_cvtpd_epi64'. Requires AVX512DQ.

func M256MaskCvtpdEpu64

func M256MaskCvtpdEpu64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_mask_cvtpd_epu64'. Requires AVX512DQ.

func M256MaskCvtpsEpi64

func M256MaskCvtpsEpi64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_mask_cvtps_epi64'. Requires AVX512DQ.

func M256MaskCvtpsEpu64

func M256MaskCvtpsEpu64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_mask_cvtps_epu64'. Requires AVX512DQ.

func M256MaskCvttpdEpi64

func M256MaskCvttpdEpi64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_mask_cvttpd_epi64'. Requires AVX512DQ.

func M256MaskCvttpdEpu64

func M256MaskCvttpdEpu64(src x86.M256i, k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_mask_cvttpd_epu64'. Requires AVX512DQ.

func M256MaskCvttpsEpi64

func M256MaskCvttpsEpi64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_mask_cvttps_epi64'. Requires AVX512DQ.

func M256MaskCvttpsEpu64

func M256MaskCvttpsEpu64(src x86.M256i, k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_mask_cvttps_epu64'. Requires AVX512DQ.

func M256MaskExtractf64x2Pd

func M256MaskExtractf64x2Pd(src x86.M128d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M128d)

M256MaskExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_mask_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskExtracti64x2Epi64

func M256MaskExtracti64x2Epi64(src x86.M128i, k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_mask_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskFpclassPdMask

func M256MaskFpclassPdMask(k1 x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.Mmask8)

M256MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm256_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskFpclassPsMask

func M256MaskFpclassPsMask(k1 x86.Mmask8, a x86.M256, imm8 byte) (dst x86.Mmask8)

M256MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm256_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskInsertf64x2

func M256MaskInsertf64x2(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256MaskInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_mask_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskInserti64x2

func M256MaskInserti64x2(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_mask_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskMulloEpi64

func M256MaskMulloEpi64(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_mask_mullo_epi64'. Requires AVX512DQ.

func M256MaskOrPd

func M256MaskOrPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPD'. Intrinsic: '_mm256_mask_or_pd'. Requires AVX512DQ.

func M256MaskOrPs

func M256MaskOrPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPS'. Intrinsic: '_mm256_mask_or_ps'. Requires AVX512DQ.

func M256MaskRangePd

func M256MaskRangePd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskRangePs

func M256MaskRangePs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskReducePd

func M256MaskReducePd(src x86.M256d, k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskReducePs

func M256MaskReducePs(src x86.M256, k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskXorPd

func M256MaskXorPd(src x86.M256d, k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm256_mask_xor_pd'. Requires AVX512DQ.

func M256MaskXorPs

func M256MaskXorPs(src x86.M256, k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm256_mask_xor_ps'. Requires AVX512DQ.

func M256MaskzAndPd

func M256MaskzAndPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm256_maskz_and_pd'. Requires AVX512DQ.

func M256MaskzAndPs

func M256MaskzAndPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm256_maskz_and_ps'. Requires AVX512DQ.

func M256MaskzAndnotPd

func M256MaskzAndnotPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm256_maskz_andnot_pd'. Requires AVX512DQ.

func M256MaskzAndnotPs

func M256MaskzAndnotPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm256_maskz_andnot_ps'. Requires AVX512DQ.

func M256MaskzBroadcastF32x2

func M256MaskzBroadcastF32x2(k x86.Mmask8, a x86.M128) (dst x86.M256)

M256MaskzBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm256_maskz_broadcast_f32x2'. Requires AVX512DQ.

func M256MaskzBroadcastF64x2

func M256MaskzBroadcastF64x2(k x86.Mmask8, a x86.M128d) (dst x86.M256d)

M256MaskzBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm256_maskz_broadcast_f64x2'. Requires AVX512DQ.

func M256MaskzBroadcastI32x2

func M256MaskzBroadcastI32x2(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm256_maskz_broadcast_i32x2'. Requires AVX512DQ.

func M256MaskzBroadcastI64x2

func M256MaskzBroadcastI64x2(k x86.Mmask8, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm256_maskz_broadcast_i64x2'. Requires AVX512DQ.

func M256MaskzCvtepi64Pd

func M256MaskzCvtepi64Pd(k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm256_maskz_cvtepi64_pd'. Requires AVX512DQ.

func M256MaskzCvtepi64Ps

func M256MaskzCvtepi64Ps(k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm256_maskz_cvtepi64_ps'. Requires AVX512DQ.

func M256MaskzCvtepu64Pd

func M256MaskzCvtepu64Pd(k x86.Mmask8, a x86.M256i) (dst x86.M256d)

M256MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm256_maskz_cvtepu64_pd'. Requires AVX512DQ.

func M256MaskzCvtepu64Ps

func M256MaskzCvtepu64Ps(k x86.Mmask8, a x86.M256i) (dst x86.M128)

M256MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm256_maskz_cvtepu64_ps'. Requires AVX512DQ.

func M256MaskzCvtpdEpi64

func M256MaskzCvtpdEpi64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm256_maskz_cvtpd_epi64'. Requires AVX512DQ.

func M256MaskzCvtpdEpu64

func M256MaskzCvtpdEpu64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm256_maskz_cvtpd_epu64'. Requires AVX512DQ.

func M256MaskzCvtpsEpi64

func M256MaskzCvtpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm256_maskz_cvtps_epi64'. Requires AVX512DQ.

func M256MaskzCvtpsEpu64

func M256MaskzCvtpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm256_maskz_cvtps_epu64'. Requires AVX512DQ.

func M256MaskzCvttpdEpi64

func M256MaskzCvttpdEpi64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm256_maskz_cvttpd_epi64'. Requires AVX512DQ.

func M256MaskzCvttpdEpu64

func M256MaskzCvttpdEpu64(k x86.Mmask8, a x86.M256d) (dst x86.M256i)

M256MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm256_maskz_cvttpd_epu64'. Requires AVX512DQ.

func M256MaskzCvttpsEpi64

func M256MaskzCvttpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm256_maskz_cvttps_epi64'. Requires AVX512DQ.

func M256MaskzCvttpsEpu64

func M256MaskzCvttpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M256i)

M256MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm256_maskz_cvttps_epu64'. Requires AVX512DQ.

func M256MaskzExtractf64x2Pd

func M256MaskzExtractf64x2Pd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M128d)

M256MaskzExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm256_maskz_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzExtracti64x2Epi64

func M256MaskzExtracti64x2Epi64(k x86.Mmask8, a x86.M256i, imm8 byte) (dst x86.M128i)

M256MaskzExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm256_maskz_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzInsertf64x2

func M256MaskzInsertf64x2(k x86.Mmask8, a x86.M256d, b x86.M128d, imm8 byte) (dst x86.M256d)

M256MaskzInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm256_maskz_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzInserti64x2

func M256MaskzInserti64x2(k x86.Mmask8, a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256MaskzInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm256_maskz_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzMulloEpi64

func M256MaskzMulloEpi64(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_maskz_mullo_epi64'. Requires AVX512DQ.

func M256MaskzOrPd

func M256MaskzOrPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPD'. Intrinsic: '_mm256_maskz_or_pd'. Requires AVX512DQ.

func M256MaskzOrPs

func M256MaskzOrPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VORPS'. Intrinsic: '_mm256_maskz_or_ps'. Requires AVX512DQ.

func M256MaskzRangePd

func M256MaskzRangePd(k x86.Mmask8, a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzRangePs

func M256MaskzRangePs(k x86.Mmask8, a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzReducePd

func M256MaskzReducePd(k x86.Mmask8, a x86.M256d, imm8 byte) (dst x86.M256d)

M256MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzReducePs

func M256MaskzReducePs(k x86.Mmask8, a x86.M256, imm8 byte) (dst x86.M256)

M256MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256MaskzXorPd

func M256MaskzXorPd(k x86.Mmask8, a x86.M256d, b x86.M256d) (dst x86.M256d)

M256MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm256_maskz_xor_pd'. Requires AVX512DQ.

func M256MaskzXorPs

func M256MaskzXorPs(k x86.Mmask8, a x86.M256, b x86.M256) (dst x86.M256)

M256MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm256_maskz_xor_ps'. Requires AVX512DQ.

func M256Movepi32Mask

func M256Movepi32Mask(a x86.M256i) (dst x86.Mmask8)

M256Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm256_movepi32_mask'. Requires AVX512DQ.

func M256Movepi64Mask

func M256Movepi64Mask(a x86.M256i) (dst x86.Mmask8)

M256Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 3
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm256_movepi64_mask'. Requires AVX512DQ.

func M256MovmEpi32

func M256MovmEpi32(k x86.Mmask8) (dst x86.M256i)

M256MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm256_movm_epi32'. Requires AVX512DQ.

func M256MovmEpi64

func M256MovmEpi64(k x86.Mmask8) (dst x86.M256i)

M256MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 3
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm256_movm_epi64'. Requires AVX512DQ.

func M256MulloEpi64

func M256MulloEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm256_mullo_epi64'. Requires AVX512DQ.

func M256RangePd

func M256RangePd(a x86.M256d, b x86.M256d, imm8 byte) (dst x86.M256d)

M256RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm256_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256RangePs

func M256RangePs(a x86.M256, b x86.M256, imm8 byte) (dst x86.M256)

M256RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm256_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256ReducePd

func M256ReducePd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm256_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M256ReducePs

func M256ReducePs(a x86.M256, imm8 byte) (dst x86.M256)

M256ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm256_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512AndPd

func M512AndPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512AndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_and_pd'. Requires AVX512DQ.

func M512AndPs

func M512AndPs(a x86.M512, b x86.M512) (dst x86.M512)

M512AndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_and_ps'. Requires AVX512DQ.

func M512AndnotPd

func M512AndnotPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512AndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_andnot_pd'. Requires AVX512DQ.

func M512AndnotPs

func M512AndnotPs(a x86.M512, b x86.M512) (dst x86.M512)

M512AndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_andnot_ps'. Requires AVX512DQ.

func M512BroadcastF32x2

func M512BroadcastF32x2(a x86.M128) (dst x86.M512)

M512BroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_broadcast_f32x2'. Requires AVX512DQ.

func M512BroadcastF32x8

func M512BroadcastF32x8(a x86.M256) (dst x86.M512)

M512BroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_broadcast_f32x8'. Requires AVX512DQ.

func M512BroadcastF64x2

func M512BroadcastF64x2(a x86.M128d) (dst x86.M512d)

M512BroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_broadcast_f64x2'. Requires AVX512DQ.

func M512BroadcastI32x2

func M512BroadcastI32x2(a x86.M128i) (dst x86.M512i)

M512BroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_broadcast_i32x2'. Requires AVX512DQ.

func M512BroadcastI32x8

func M512BroadcastI32x8(a x86.M256i) (dst x86.M512i)

M512BroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_broadcast_i32x8'. Requires AVX512DQ.

func M512BroadcastI64x2

func M512BroadcastI64x2(a x86.M128i) (dst x86.M512i)

M512BroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_broadcast_i64x2'. Requires AVX512DQ.

func M512CvtRoundepi64Pd

func M512CvtRoundepi64Pd(a x86.M512i, rounding int) (dst x86.M512d)

M512CvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512CvtRoundepi64Ps

func M512CvtRoundepi64Ps(a x86.M512i, rounding int) (dst x86.M256)

M512CvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512CvtRoundepu64Pd

func M512CvtRoundepu64Pd(a x86.M512i, rounding int) (dst x86.M512d)

M512CvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512CvtRoundepu64Ps

func M512CvtRoundepu64Ps(a x86.M512i, rounding int) (dst x86.M256)

M512CvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512CvtRoundpdEpi64

func M512CvtRoundpdEpi64(a x86.M512d, rounding int) (dst x86.M512i)

M512CvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512CvtRoundpdEpu64

func M512CvtRoundpdEpu64(a x86.M512d, rounding int) (dst x86.M512i)

M512CvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512CvtRoundpsEpi64

func M512CvtRoundpsEpi64(a x86.M256, rounding int) (dst x86.M512i)

M512CvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_cvt_roundps_epi64'. Requires AVX512DQ.

func M512CvtRoundpsEpu64

func M512CvtRoundpsEpu64(a x86.M256, rounding int) (dst x86.M512i)

M512CvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_cvt_roundps_epu64'. Requires AVX512DQ.

func M512Cvtepi64Pd

func M512Cvtepi64Pd(a x86.M512i) (dst x86.M512d)

M512Cvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_cvtepi64_pd'. Requires AVX512DQ.

func M512Cvtepi64Ps

func M512Cvtepi64Ps(a x86.M512i) (dst x86.M256)

M512Cvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_cvtepi64_ps'. Requires AVX512DQ.

func M512Cvtepu64Pd

func M512Cvtepu64Pd(a x86.M512i) (dst x86.M512d)

M512Cvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_cvtepu64_pd'. Requires AVX512DQ.

func M512Cvtepu64Ps

func M512Cvtepu64Ps(a x86.M512i) (dst x86.M256)

M512Cvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_cvtepu64_ps'. Requires AVX512DQ.

func M512CvtpdEpi64

func M512CvtpdEpi64(a x86.M512d) (dst x86.M512i)

M512CvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_cvtpd_epi64'. Requires AVX512DQ.

func M512CvtpdEpu64

func M512CvtpdEpu64(a x86.M512d) (dst x86.M512i)

M512CvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_cvtpd_epu64'. Requires AVX512DQ.

func M512CvtpsEpi64

func M512CvtpsEpi64(a x86.M256) (dst x86.M512i)

M512CvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_cvtps_epi64'. Requires AVX512DQ.

func M512CvtpsEpu64

func M512CvtpsEpu64(a x86.M256) (dst x86.M512i)

M512CvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_cvtps_epu64'. Requires AVX512DQ.

func M512CvttRoundpdEpi64

func M512CvttRoundpdEpi64(a x86.M512d, sae int) (dst x86.M512i)

M512CvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512CvttRoundpdEpu64

func M512CvttRoundpdEpu64(a x86.M512d, sae int) (dst x86.M512i)

M512CvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512CvttRoundpsEpi64

func M512CvttRoundpsEpi64(a x86.M256, sae int) (dst x86.M512i)

M512CvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512CvttRoundpsEpu64

func M512CvttRoundpsEpu64(a x86.M256, sae int) (dst x86.M512i)

M512CvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'. Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512CvttpdEpi64

func M512CvttpdEpi64(a x86.M512d) (dst x86.M512i)

M512CvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_cvttpd_epi64'. Requires AVX512DQ.

func M512CvttpdEpu64

func M512CvttpdEpu64(a x86.M512d) (dst x86.M512i)

M512CvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_cvttpd_epu64'. Requires AVX512DQ.

func M512CvttpsEpi64

func M512CvttpsEpi64(a x86.M256) (dst x86.M512i)

M512CvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_cvttps_epi64'. Requires AVX512DQ.

func M512CvttpsEpu64

func M512CvttpsEpu64(a x86.M256) (dst x86.M512i)

M512CvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	l := j*32
	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_cvttps_epu64'. Requires AVX512DQ.

func M512Extractf32x8Ps

func M512Extractf32x8Ps(a x86.M512, imm8 byte) (dst x86.M256)

M512Extractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extractf64x2Pd

func M512Extractf64x2Pd(a x86.M512d, imm8 byte) (dst x86.M128d)

M512Extractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extracti32x8Epi32

func M512Extracti32x8Epi32(a x86.M512i, imm8 byte) (dst x86.M256i)

M512Extracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Extracti64x2Epi64

func M512Extracti64x2Epi64(a x86.M512i, imm8 byte) (dst x86.M128i)

M512Extracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512FpclassPdMask

func M512FpclassPdMask(a x86.M512d, imm8 byte) (dst x86.Mmask8)

M512FpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*64
			k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm512_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512FpclassPsMask

func M512FpclassPsMask(a x86.M512, imm8 byte) (dst x86.Mmask16)

M512FpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k'.

	'imm' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 15
			i := j*32
			k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
		ENDFOR
		k[MAX:16] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm512_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Insertf32x8

func M512Insertf32x8(a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512Insertf32x8: Copy 'a' to 'dst', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Insertf64x2

func M512Insertf64x2(a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512Insertf64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Inserti32x8

func M512Inserti32x8(a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512Inserti32x8: Copy 'a' to 'dst', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512Inserti64x2

func M512Inserti64x2(a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512Inserti64x2: Copy 'a' to 'dst', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'dst' at the location specified by 'imm8'.

dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskAndPd

func M512MaskAndPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_mask_and_pd'. Requires AVX512DQ.

func M512MaskAndPs

func M512MaskAndPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_mask_and_ps'. Requires AVX512DQ.

func M512MaskAndnotPd

func M512MaskAndnotPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_mask_andnot_pd'. Requires AVX512DQ.

func M512MaskAndnotPs

func M512MaskAndnotPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_mask_andnot_ps'. Requires AVX512DQ.

func M512MaskBroadcastF32x2

func M512MaskBroadcastF32x2(src x86.M512, k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_mask_broadcast_f32x2'. Requires AVX512DQ.

func M512MaskBroadcastF32x8

func M512MaskBroadcastF32x8(src x86.M512, k x86.Mmask16, a x86.M256) (dst x86.M512)

M512MaskBroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_mask_broadcast_f32x8'. Requires AVX512DQ.

func M512MaskBroadcastF64x2

func M512MaskBroadcastF64x2(src x86.M512d, k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_mask_broadcast_f64x2'. Requires AVX512DQ.

func M512MaskBroadcastI32x2

func M512MaskBroadcastI32x2(src x86.M512i, k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_mask_broadcast_i32x2'. Requires AVX512DQ.

func M512MaskBroadcastI32x8

func M512MaskBroadcastI32x8(src x86.M512i, k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskBroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_mask_broadcast_i32x8'. Requires AVX512DQ.

func M512MaskBroadcastI64x2

func M512MaskBroadcastI64x2(src x86.M512i, k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_mask_broadcast_i64x2'. Requires AVX512DQ.

func M512MaskCvtRoundepi64Pd

func M512MaskCvtRoundepi64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskCvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_mask_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512MaskCvtRoundepi64Ps

func M512MaskCvtRoundepi64Ps(src x86.M256, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskCvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512MaskCvtRoundepu64Pd

func M512MaskCvtRoundepu64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskCvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_mask_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512MaskCvtRoundepu64Ps

func M512MaskCvtRoundepu64Ps(src x86.M256, k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskCvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := src[l+31:l]
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_mask_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512MaskCvtRoundpdEpi64

func M512MaskCvtRoundpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskCvtRoundpdEpu64

func M512MaskCvtRoundpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskCvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_mask_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskCvtRoundpsEpi64

func M512MaskCvtRoundpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_mask_cvt_roundps_epi64'. Requires AVX512DQ.

func M512MaskCvtRoundpsEpu64

func M512MaskCvtRoundpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskCvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_mask_cvt_roundps_epu64'. Requires AVX512DQ.

func M512MaskCvtepi64Pd

func M512MaskCvtepi64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_mask_cvtepi64_pd'. Requires AVX512DQ.

func M512MaskCvtepi64Ps

func M512MaskCvtepi64Ps(src x86.M256, k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_mask_cvtepi64_ps'. Requires AVX512DQ.

func M512MaskCvtepu64Pd

func M512MaskCvtepu64Pd(src x86.M512d, k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_mask_cvtepu64_pd'. Requires AVX512DQ.

func M512MaskCvtepu64Ps

func M512MaskCvtepu64Ps(src x86.M256, k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_mask_cvtepu64_ps'. Requires AVX512DQ.

func M512MaskCvtpdEpi64

func M512MaskCvtpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_mask_cvtpd_epi64'. Requires AVX512DQ.

func M512MaskCvtpdEpu64

func M512MaskCvtpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_mask_cvtpd_epu64'. Requires AVX512DQ.

func M512MaskCvtpsEpi64

func M512MaskCvtpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_mask_cvtps_epi64'. Requires AVX512DQ.

func M512MaskCvtpsEpu64

func M512MaskCvtpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_mask_cvtps_epu64'. Requires AVX512DQ.

func M512MaskCvttRoundpdEpi64

func M512MaskCvttRoundpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskCvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskCvttRoundpdEpu64

func M512MaskCvttRoundpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskCvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		IF k[j]
			dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_mask_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskCvttRoundpsEpi64

func M512MaskCvttRoundpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		l := j*32
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512MaskCvttRoundpsEpu64

func M512MaskCvttRoundpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskCvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

	FOR j := 0 to 7
		i := j*64
		l := j*32
		IF k[j]
			dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
		ELSE
			dst[i+63:i] := src[i+63:i]
		FI
	ENDFOR
	dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_mask_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512MaskCvttpdEpi64

func M512MaskCvttpdEpi64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_mask_cvttpd_epi64'. Requires AVX512DQ.

func M512MaskCvttpdEpu64

func M512MaskCvttpdEpu64(src x86.M512i, k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_mask_cvttpd_epu64'. Requires AVX512DQ.

func M512MaskCvttpsEpi64

func M512MaskCvttpsEpi64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_mask_cvttps_epi64'. Requires AVX512DQ.

func M512MaskCvttpsEpu64

func M512MaskCvttpsEpu64(src x86.M512i, k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_mask_cvttps_epu64'. Requires AVX512DQ.

func M512MaskExtractf32x8Ps

func M512MaskExtractf32x8Ps(src x86.M256, k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M256)

M512MaskExtractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_mask_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtractf64x2Pd

func M512MaskExtractf64x2Pd(src x86.M128d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M128d)

M512MaskExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_mask_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti32x8Epi32

func M512MaskExtracti32x8Epi32(src x86.M256i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskExtracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_mask_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskExtracti64x2Epi64

func M512MaskExtracti64x2Epi64(src x86.M128i, k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_mask_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskFpclassPdMask

func M512MaskFpclassPdMask(k1 x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.Mmask8)

M512MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 7
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:8] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm512_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskFpclassPsMask

func M512MaskFpclassPsMask(k1 x86.Mmask16, a x86.M512, imm8 byte) (dst x86.Mmask16)

M512MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 15
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:16] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm512_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf32x8

func M512MaskInsertf32x8(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512MaskInsertf32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_mask_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInsertf64x2

func M512MaskInsertf64x2(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512MaskInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) OF
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_mask_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti32x8

func M512MaskInserti32x8(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskInserti32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_mask_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskInserti64x2

func M512MaskInserti64x2(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) OF
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_mask_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskMulloEpi64

func M512MaskMulloEpi64(src x86.M512i, k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_mask_mullo_epi64'. Requires AVX512DQ.

func M512MaskOrPd

func M512MaskOrPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_mask_or_pd'. Requires AVX512DQ.

func M512MaskOrPs

func M512MaskOrPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_mask_or_ps'. Requires AVX512DQ.

func M512MaskRangePd

func M512MaskRangePd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangePs

func M512MaskRangePs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangeRoundPd

func M512MaskRangeRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskRangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_mask_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskRangeRoundPs

func M512MaskRangeRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskRangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_mask_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReducePd

func M512MaskReducePd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReducePs

func M512MaskReducePs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReduceRoundPd

func M512MaskReduceRoundPd(src x86.M512d, k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := src[i+63:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_mask_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskReduceRoundPs

func M512MaskReduceRoundPs(src x86.M512, k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := src[i+31:i]
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_mask_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskXorPd

func M512MaskXorPd(src x86.M512d, k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_mask_xor_pd'. Requires AVX512DQ.

func M512MaskXorPs

func M512MaskXorPs(src x86.M512, k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_mask_xor_ps'. Requires AVX512DQ.

func M512MaskzAndPd

func M512MaskzAndPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm512_maskz_and_pd'. Requires AVX512DQ.

func M512MaskzAndPs

func M512MaskzAndPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm512_maskz_and_ps'. Requires AVX512DQ.

func M512MaskzAndnotPd

func M512MaskzAndnotPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm512_maskz_andnot_pd'. Requires AVX512DQ.

func M512MaskzAndnotPs

func M512MaskzAndnotPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm512_maskz_andnot_ps'. Requires AVX512DQ.

func M512MaskzBroadcastF32x2

func M512MaskzBroadcastF32x2(k x86.Mmask16, a x86.M128) (dst x86.M512)

M512MaskzBroadcastF32x2: Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X2'. Intrinsic: '_mm512_maskz_broadcast_f32x2'. Requires AVX512DQ.

func M512MaskzBroadcastF32x8

func M512MaskzBroadcastF32x8(k x86.Mmask16, a x86.M256) (dst x86.M512)

M512MaskzBroadcastF32x8: Broadcast the 8 packed single-precision (32-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF32X8'. Intrinsic: '_mm512_maskz_broadcast_f32x8'. Requires AVX512DQ.

func M512MaskzBroadcastF64x2

func M512MaskzBroadcastF64x2(k x86.Mmask8, a x86.M128d) (dst x86.M512d)

M512MaskzBroadcastF64x2: Broadcast the 2 packed double-precision (64-bit) floating-point elements from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTF64X2'. Intrinsic: '_mm512_maskz_broadcast_f64x2'. Requires AVX512DQ.

func M512MaskzBroadcastI32x2

func M512MaskzBroadcastI32x2(k x86.Mmask16, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm512_maskz_broadcast_i32x2'. Requires AVX512DQ.

func M512MaskzBroadcastI32x8

func M512MaskzBroadcastI32x8(k x86.Mmask16, a x86.M256i) (dst x86.M512i)

M512MaskzBroadcastI32x8: Broadcast the 8 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	n := (j mod 8)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI32X8'. Intrinsic: '_mm512_maskz_broadcast_i32x8'. Requires AVX512DQ.

func M512MaskzBroadcastI64x2

func M512MaskzBroadcastI64x2(k x86.Mmask8, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastI64x2: Broadcast the 2 packed 64-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	n := (j mod 2)*64
	IF k[j]
		dst[i+63:i] := a[n+63:n]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VBROADCASTI64X2'. Intrinsic: '_mm512_maskz_broadcast_i64x2'. Requires AVX512DQ.

func M512MaskzCvtRoundepi64Pd

func M512MaskzCvtRoundepi64Pd(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskzCvtRoundepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_maskz_cvt_roundepi64_pd'. Requires AVX512DQ.

func M512MaskzCvtRoundepi64Ps

func M512MaskzCvtRoundepi64Ps(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskzCvtRoundepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepi64_ps'. Requires AVX512DQ.

func M512MaskzCvtRoundepu64Pd

func M512MaskzCvtRoundepu64Pd(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M512d)

M512MaskzCvtRoundepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_maskz_cvt_roundepu64_pd'. Requires AVX512DQ.

func M512MaskzCvtRoundepu64Ps

func M512MaskzCvtRoundepu64Ps(k x86.Mmask8, a x86.M512i, rounding int) (dst x86.M256)

M512MaskzCvtRoundepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
			ELSE
				dst[l+31:l] := 0
			FI
		ENDFOR
		dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_maskz_cvt_roundepu64_ps'. Requires AVX512DQ.

func M512MaskzCvtRoundpdEpi64

func M512MaskzCvtRoundpdEpi64(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskzCvtRoundpdEpu64

func M512MaskzCvtRoundpdEpu64(k x86.Mmask8, a x86.M512d, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_maskz_cvt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskzCvtRoundpsEpi64

func M512MaskzCvtRoundpsEpi64(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epi64'. Requires AVX512DQ.

func M512MaskzCvtRoundpsEpu64

func M512MaskzCvtRoundpsEpu64(k x86.Mmask8, a x86.M256, rounding int) (dst x86.M512i)

M512MaskzCvtRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		FOR j := 0 to 7
			i := j*64
			l := j*32
			IF k[j]
				dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_maskz_cvt_roundps_epu64'. Requires AVX512DQ.

func M512MaskzCvtepi64Pd

func M512MaskzCvtepi64Pd(k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm512_maskz_cvtepi64_pd'. Requires AVX512DQ.

func M512MaskzCvtepi64Ps

func M512MaskzCvtepi64Ps(k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm512_maskz_cvtepi64_ps'. Requires AVX512DQ.

func M512MaskzCvtepu64Pd

func M512MaskzCvtepu64Pd(k x86.Mmask8, a x86.M512i) (dst x86.M512d)

M512MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm512_maskz_cvtepu64_pd'. Requires AVX512DQ.

func M512MaskzCvtepu64Ps

func M512MaskzCvtepu64Ps(k x86.Mmask8, a x86.M512i) (dst x86.M256)

M512MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm512_maskz_cvtepu64_ps'. Requires AVX512DQ.

func M512MaskzCvtpdEpi64

func M512MaskzCvtpdEpi64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm512_maskz_cvtpd_epi64'. Requires AVX512DQ.

func M512MaskzCvtpdEpu64

func M512MaskzCvtpdEpu64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm512_maskz_cvtpd_epu64'. Requires AVX512DQ.

func M512MaskzCvtpsEpi64

func M512MaskzCvtpsEpi64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm512_maskz_cvtps_epi64'. Requires AVX512DQ.

func M512MaskzCvtpsEpu64

func M512MaskzCvtpsEpu64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm512_maskz_cvtps_epu64'. Requires AVX512DQ.

func M512MaskzCvttRoundpdEpi64

func M512MaskzCvttRoundpdEpi64(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskzCvttRoundpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epi64'. Requires AVX512DQ.

func M512MaskzCvttRoundpdEpu64

func M512MaskzCvttRoundpdEpu64(k x86.Mmask8, a x86.M512d, sae int) (dst x86.M512i)

M512MaskzCvttRoundpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_maskz_cvtt_roundpd_epu64'. Requires AVX512DQ.

func M512MaskzCvttRoundpsEpi64

func M512MaskzCvttRoundpsEpi64(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epi64'. Requires AVX512DQ.

func M512MaskzCvttRoundpsEpu64

func M512MaskzCvttRoundpsEpu64(k x86.Mmask8, a x86.M256, sae int) (dst x86.M512i)

M512MaskzCvttRoundpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set). Pass _MM_FROUND_NO_EXC to 'sae' to suppress all exceptions.

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_maskz_cvtt_roundps_epu64'. Requires AVX512DQ.

func M512MaskzCvttpdEpi64

func M512MaskzCvttpdEpi64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm512_maskz_cvttpd_epi64'. Requires AVX512DQ.

func M512MaskzCvttpdEpu64

func M512MaskzCvttpdEpu64(k x86.Mmask8, a x86.M512d) (dst x86.M512i)

M512MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm512_maskz_cvttpd_epu64'. Requires AVX512DQ.

func M512MaskzCvttpsEpi64

func M512MaskzCvttpsEpi64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm512_maskz_cvttps_epi64'. Requires AVX512DQ.

func M512MaskzCvttpsEpu64

func M512MaskzCvttpsEpu64(k x86.Mmask8, a x86.M256) (dst x86.M512i)

M512MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm512_maskz_cvttps_epu64'. Requires AVX512DQ.

func M512MaskzExtractf32x8Ps

func M512MaskzExtractf32x8Ps(k x86.Mmask8, a x86.M512, imm8 byte) (dst x86.M256)

M512MaskzExtractf32x8Ps: Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTF32X8'. Intrinsic: '_mm512_maskz_extractf32x8_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtractf64x2Pd

func M512MaskzExtractf64x2Pd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M128d)

M512MaskzExtractf64x2Pd: Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTF64X2'. Intrinsic: '_mm512_maskz_extractf64x2_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti32x8Epi32

func M512MaskzExtracti32x8Epi32(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M256i)

M512MaskzExtracti32x8Epi32: Extract 256 bits (composed of 8 packed 32-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VEXTRACTI32X8'. Intrinsic: '_mm512_maskz_extracti32x8_epi32'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzExtracti64x2Epi64

func M512MaskzExtracti64x2Epi64(k x86.Mmask8, a x86.M512i, imm8 byte) (dst x86.M128i)

M512MaskzExtracti64x2Epi64: Extract 128 bits (composed of 2 packed 64-bit integers) from 'a', selected with 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VEXTRACTI64X2'. Intrinsic: '_mm512_maskz_extracti64x2_epi64'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInsertf32x8

func M512MaskzInsertf32x8(k x86.Mmask16, a x86.M512, b x86.M256, imm8 byte) (dst x86.M512)

M512MaskzInsertf32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF32X8'. Intrinsic: '_mm512_maskz_insertf32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInsertf64x2

func M512MaskzInsertf64x2(k x86.Mmask8, a x86.M512d, b x86.M128d, imm8 byte) (dst x86.M512d)

M512MaskzInsertf64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTF64X2'. Intrinsic: '_mm512_maskz_insertf64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti32x8

func M512MaskzInserti32x8(k x86.Mmask16, a x86.M512i, b x86.M256i, imm8 byte) (dst x86.M512i)

M512MaskzInserti32x8: Copy 'a' to 'tmp', then insert 256 bits (composed of 8 packed 32-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := tmp[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI32X8'. Intrinsic: '_mm512_maskz_inserti32x8'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzInserti64x2

func M512MaskzInserti64x2(k x86.Mmask8, a x86.M512i, b x86.M128i, imm8 byte) (dst x86.M512i)

M512MaskzInserti64x2: Copy 'a' to 'tmp', then insert 128 bits (composed of 2 packed 64-bit integers) from 'b' into 'tmp' at the location specified by 'imm8'. Store 'tmp' to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := tmp[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VINSERTI64X2'. Intrinsic: '_mm512_maskz_inserti64x2'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzMulloEpi64

func M512MaskzMulloEpi64(k x86.Mmask8, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_maskz_mullo_epi64'. Requires AVX512DQ.

func M512MaskzOrPd

func M512MaskzOrPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_maskz_or_pd'. Requires AVX512DQ.

func M512MaskzOrPs

func M512MaskzOrPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_maskz_or_ps'. Requires AVX512DQ.

func M512MaskzRangePd

func M512MaskzRangePd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangePs

func M512MaskzRangePs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangeRoundPd

func M512MaskzRangeRoundPd(k x86.Mmask8, a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzRangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_maskz_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzRangeRoundPs

func M512MaskzRangeRoundPs(k x86.Mmask16, a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzRangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_maskz_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReducePd

func M512MaskzReducePd(k x86.Mmask8, a x86.M512d, imm8 byte) (dst x86.M512d)

M512MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReducePs

func M512MaskzReducePs(k x86.Mmask16, a x86.M512, imm8 byte) (dst x86.M512)

M512MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReduceRoundPd

func M512MaskzReduceRoundPd(k x86.Mmask8, a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512MaskzReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			IF k[j]
				dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
			ELSE
				dst[i+63:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_maskz_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzReduceRoundPs

func M512MaskzReduceRoundPs(k x86.Mmask16, a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512MaskzReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			IF k[j]
				dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
			ELSE
				dst[i+31:i] := 0
			FI
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_maskz_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512MaskzXorPd

func M512MaskzXorPd(k x86.Mmask8, a x86.M512d, b x86.M512d) (dst x86.M512d)

M512MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_maskz_xor_pd'. Requires AVX512DQ.

func M512MaskzXorPs

func M512MaskzXorPs(k x86.Mmask16, a x86.M512, b x86.M512) (dst x86.M512)

M512MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_maskz_xor_ps'. Requires AVX512DQ.

func M512Movepi32Mask

func M512Movepi32Mask(a x86.M512i) (dst x86.Mmask16)

M512Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 15
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm512_movepi32_mask'. Requires AVX512DQ.

func M512Movepi64Mask

func M512Movepi64Mask(a x86.M512i) (dst x86.Mmask8)

M512Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 7
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm512_movepi64_mask'. Requires AVX512DQ.

func M512MovmEpi32

func M512MovmEpi32(k x86.Mmask16) (dst x86.M512i)

M512MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm512_movm_epi32'. Requires AVX512DQ.

func M512MovmEpi64

func M512MovmEpi64(k x86.Mmask8) (dst x86.M512i)

M512MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 7
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm512_movm_epi64'. Requires AVX512DQ.

func M512MulloEpi64

func M512MulloEpi64(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 7
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm512_mullo_epi64'. Requires AVX512DQ.

func M512OrPd

func M512OrPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512OrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPD'. Intrinsic: '_mm512_or_pd'. Requires AVX512DQ.

func M512OrPs

func M512OrPs(a x86.M512, b x86.M512) (dst x86.M512)

M512OrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VORPS'. Intrinsic: '_mm512_or_ps'. Requires AVX512DQ.

func M512RangePd

func M512RangePd(a x86.M512d, b x86.M512d, imm8 byte) (dst x86.M512d)

M512RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangePs

func M512RangePs(a x86.M512, b x86.M512, imm8 byte) (dst x86.M512)

M512RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangeRoundPd

func M512RangeRoundPd(a x86.M512d, b x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512RangeRoundPd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm512_range_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512RangeRoundPs

func M512RangeRoundPs(a x86.M512, b x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512RangeRoundPs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm512_range_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReducePd

func M512ReducePd(a x86.M512d, imm8 byte) (dst x86.M512d)

M512ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReducePs

func M512ReducePs(a x86.M512, imm8 byte) (dst x86.M512)

M512ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReduceRoundPd

func M512ReduceRoundPd(a x86.M512d, imm8 byte, rounding int) (dst x86.M512d)

M512ReduceRoundPd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		FOR j := 0 to 7
			i := j*64
			dst[i+63:i] := ReduceArgumentPD(src[i+63:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm512_reduce_round_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512ReduceRoundPs

func M512ReduceRoundPs(a x86.M512, imm8 byte, rounding int) (dst x86.M512)

M512ReduceRoundPs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}
		FOR j := 0 to 15
			i := j*32
			dst[i+31:i] := ReduceArgumentPS(src[i+31:i], imm8[7:0])
		ENDFOR
		dst[MAX:512] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm512_reduce_round_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func M512XorPd

func M512XorPd(a x86.M512d, b x86.M512d) (dst x86.M512d)

M512XorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*64
	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm512_xor_pd'. Requires AVX512DQ.

func M512XorPs

func M512XorPs(a x86.M512, b x86.M512) (dst x86.M512)

M512XorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm512_xor_ps'. Requires AVX512DQ.

func MaskAndPd

func MaskAndPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm_mask_and_pd'. Requires AVX512DQ.

func MaskAndPs

func MaskAndPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm_mask_and_ps'. Requires AVX512DQ.

func MaskAndnotPd

func MaskAndnotPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm_mask_andnot_pd'. Requires AVX512DQ.

func MaskAndnotPs

func MaskAndnotPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm_mask_andnot_ps'. Requires AVX512DQ.

func MaskBroadcastI32x2

func MaskBroadcastI32x2(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := src[n+31:n]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_mask_broadcast_i32x2'. Requires AVX512DQ.

func MaskCvtepi64Pd

func MaskCvtepi64Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_mask_cvtepi64_pd'. Requires AVX512DQ.

func MaskCvtepi64Ps

func MaskCvtepi64Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_mask_cvtepi64_ps'. Requires AVX512DQ.

func MaskCvtepu64Pd

func MaskCvtepu64Pd(src x86.M128d, k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_mask_cvtepu64_pd'. Requires AVX512DQ.

func MaskCvtepu64Ps

func MaskCvtepu64Ps(src x86.M128, k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := src[l+31:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_mask_cvtepu64_ps'. Requires AVX512DQ.

func MaskCvtpdEpi64

func MaskCvtpdEpi64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_mask_cvtpd_epi64'. Requires AVX512DQ.

func MaskCvtpdEpu64

func MaskCvtpdEpu64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_mask_cvtpd_epu64'. Requires AVX512DQ.

func MaskCvtpsEpi64

func MaskCvtpsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_mask_cvtps_epi64'. Requires AVX512DQ.

func MaskCvtpsEpu64

func MaskCvtpsEpu64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_mask_cvtps_epu64'. Requires AVX512DQ.

func MaskCvttpdEpi64

func MaskCvttpdEpi64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_mask_cvttpd_epi64'. Requires AVX512DQ.

func MaskCvttpdEpu64

func MaskCvttpdEpu64(src x86.M128i, k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_mask_cvttpd_epu64'. Requires AVX512DQ.

func MaskCvttpsEpi64

func MaskCvttpsEpi64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_mask_cvttps_epi64'. Requires AVX512DQ.

func MaskCvttpsEpu64

func MaskCvttpsEpu64(src x86.M128i, k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_mask_cvttps_epu64'. Requires AVX512DQ.

func MaskFpclassPdMask

func MaskFpclassPdMask(k1 x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskFpclassPdMask: Test packed double-precision (64-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 1
			i := j*64
			IF k1[j]
				k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:2] := 0

Instruction: 'VFPCLASSPD'. Intrinsic: '_mm_mask_fpclass_pd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassPsMask

func MaskFpclassPsMask(k1 x86.Mmask8, a x86.M128, imm8 byte) (dst x86.Mmask8)

MaskFpclassPsMask: Test packed single-precision (32-bit) floating-point elements in 'a' for special categories specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		FOR j := 0 to 3
			i := j*32
			IF k1[j]
				k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
			ELSE
				k[j] := 0
			FI
		ENDFOR
		k[MAX:4] := 0

Instruction: 'VFPCLASSPS'. Intrinsic: '_mm_mask_fpclass_ps_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassSdMask

func MaskFpclassSdMask(k1 x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.Mmask8)

MaskFpclassSdMask: Test the lower double-precision (64-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		IF k1[0]
			k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
		ELSE
			k[0] := 0
		FI
		k[MAX:1] := 0

Instruction: 'VFPCLASSSD'. Intrinsic: '_mm_mask_fpclass_sd_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskFpclassSsMask

func MaskFpclassSsMask(k1 x86.Mmask8, a x86.M128, imm8 byte) (dst x86.Mmask8)

MaskFpclassSsMask: Test the lower single-precision (32-bit) floating-point element in 'a' for special categories specified by 'imm8', and store the result in mask vector 'k' using zeromask 'k1' (the element is zeroed out when mask bit 0 is not set).

	'imm8' can be a combination of:
    0x01 // QNaN
    0x02 // Positive Zero
    0x04 // Negative Zero
    0x08 // Positive Infinity
    0x10 // Negative Infinity
    0x20 // Denormal
    0x40 // Negative
    0x80 // SNaN

		IF k1[0]
			k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
		ELSE
			k[0] := 0
		FI
		k[MAX:1] := 0

Instruction: 'VFPCLASSSS'. Intrinsic: '_mm_mask_fpclass_ss_mask'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskMulloEpi64

func MaskMulloEpi64(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_mask_mullo_epi64'. Requires AVX512DQ.

func MaskOrPd

func MaskOrPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPD'. Intrinsic: '_mm_mask_or_pd'. Requires AVX512DQ.

func MaskOrPs

func MaskOrPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPS'. Intrinsic: '_mm_mask_or_ps'. Requires AVX512DQ.

func MaskRangePd

func MaskRangePd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_mask_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangePs

func MaskRangePs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_mask_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeRoundSd

func MaskRangeRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskRangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_mask_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeRoundSs

func MaskRangeRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskRangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_mask_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeSd

func MaskRangeSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskRangeSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_mask_range_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskRangeSs

func MaskRangeSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskRangeSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_mask_range_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReducePd

func MaskReducePd(src x86.M128d, k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_mask_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReducePs

func MaskReducePs(src x86.M128, k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_mask_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceRoundSd

func MaskReduceRoundSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		IF k[0]
			dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := src[63:0]
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_mask_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceRoundSs

func MaskReduceRoundSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		IF k[0]
			dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := src[31:0]
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_mask_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceSd

func MaskReduceSd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

IF k[0]
	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_mask_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskReduceSs

func MaskReduceSs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using writemask 'k' (the element is copied from 'src' when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

IF k[0]
	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_mask_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskXorPd

func MaskXorPd(src x86.M128d, k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm_mask_xor_pd'. Requires AVX512DQ.

func MaskXorPs

func MaskXorPs(src x86.M128, k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm_mask_xor_ps'. Requires AVX512DQ.

func MaskzAndPd

func MaskzAndPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAndPd: Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPD'. Intrinsic: '_mm_maskz_and_pd'. Requires AVX512DQ.

func MaskzAndPs

func MaskzAndPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDPS'. Intrinsic: '_mm_maskz_and_ps'. Requires AVX512DQ.

func MaskzAndnotPd

func MaskzAndnotPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzAndnotPd: Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPD'. Intrinsic: '_mm_maskz_andnot_pd'. Requires AVX512DQ.

func MaskzAndnotPs

func MaskzAndnotPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzAndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VANDNPS'. Intrinsic: '_mm_maskz_andnot_ps'. Requires AVX512DQ.

func MaskzBroadcastI32x2

func MaskzBroadcastI32x2(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastI32x2: Broadcast the lower 2 packed 32-bit integers from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	n := (j mod 2)*32
	IF k[j]
		dst[i+31:i] := a[n+31:n]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTI32X2'. Intrinsic: '_mm_maskz_broadcast_i32x2'. Requires AVX512DQ.

func MaskzCvtepi64Pd

func MaskzCvtepi64Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepi64Pd: Convert packed 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTQQ2PD'. Intrinsic: '_mm_maskz_cvtepi64_pd'. Requires AVX512DQ.

func MaskzCvtepi64Ps

func MaskzCvtepi64Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepi64Ps: Convert packed 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTQQ2PS'. Intrinsic: '_mm_maskz_cvtepi64_ps'. Requires AVX512DQ.

func MaskzCvtepu64Pd

func MaskzCvtepu64Pd(k x86.Mmask8, a x86.M128i) (dst x86.M128d)

MaskzCvtepu64Pd: Convert packed unsigned 64-bit integers in 'a' to packed double-precision (64-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTUQQ2PD'. Intrinsic: '_mm_maskz_cvtepu64_pd'. Requires AVX512DQ.

func MaskzCvtepu64Ps

func MaskzCvtepu64Ps(k x86.Mmask8, a x86.M128i) (dst x86.M128)

MaskzCvtepu64Ps: Convert packed unsigned 64-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
	ELSE
		dst[l+31:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VCVTUQQ2PS'. Intrinsic: '_mm_maskz_cvtepu64_ps'. Requires AVX512DQ.

func MaskzCvtpdEpi64

func MaskzCvtpdEpi64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2QQ'. Intrinsic: '_mm_maskz_cvtpd_epi64'. Requires AVX512DQ.

func MaskzCvtpdEpu64

func MaskzCvtpdEpu64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvtpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPD2UQQ'. Intrinsic: '_mm_maskz_cvtpd_epu64'. Requires AVX512DQ.

func MaskzCvtpsEpi64

func MaskzCvtpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2QQ'. Intrinsic: '_mm_maskz_cvtps_epi64'. Requires AVX512DQ.

func MaskzCvtpsEpu64

func MaskzCvtpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvtpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTPS2UQQ'. Intrinsic: '_mm_maskz_cvtps_epu64'. Requires AVX512DQ.

func MaskzCvttpdEpi64

func MaskzCvttpdEpi64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpi64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2QQ'. Intrinsic: '_mm_maskz_cvttpd_epi64'. Requires AVX512DQ.

func MaskzCvttpdEpu64

func MaskzCvttpdEpu64(k x86.Mmask8, a x86.M128d) (dst x86.M128i)

MaskzCvttpdEpu64: Convert packed double-precision (64-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPD2UQQ'. Intrinsic: '_mm_maskz_cvttpd_epu64'. Requires AVX512DQ.

func MaskzCvttpsEpi64

func MaskzCvttpsEpi64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpi64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2QQ'. Intrinsic: '_mm_maskz_cvttps_epi64'. Requires AVX512DQ.

func MaskzCvttpsEpu64

func MaskzCvttpsEpu64(k x86.Mmask8, a x86.M128) (dst x86.M128i)

MaskzCvttpsEpu64: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed unsigned 64-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	l := j*32
	IF k[j]
		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VCVTTPS2UQQ'. Intrinsic: '_mm_maskz_cvttps_epu64'. Requires AVX512DQ.

func MaskzMulloEpi64

func MaskzMulloEpi64(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		tmp[127:0] := a[i+63:i] * b[i+63:i]
		dst[i+63:i] := tmp[63:0]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_maskz_mullo_epi64'. Requires AVX512DQ.

func MaskzOrPd

func MaskzOrPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzOrPd: Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPD'. Intrinsic: '_mm_maskz_or_pd'. Requires AVX512DQ.

func MaskzOrPs

func MaskzOrPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzOrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VORPS'. Intrinsic: '_mm_maskz_or_ps'. Requires AVX512DQ.

func MaskzRangePd

func MaskzRangePd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_maskz_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangePs

func MaskzRangePs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_maskz_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeRoundSd

func MaskzRangeRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzRangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_maskz_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeRoundSs

func MaskzRangeRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzRangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		IF k[0]
			dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_maskz_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeSd

func MaskzRangeSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzRangeSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_maskz_range_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzRangeSs

func MaskzRangeSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzRangeSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

IF k[0]
	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_maskz_range_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReducePd

func MaskzReducePd(k x86.Mmask8, a x86.M128d, imm8 byte) (dst x86.M128d)

MaskzReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_maskz_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReducePs

func MaskzReducePs(k x86.Mmask8, a x86.M128, imm8 byte) (dst x86.M128)

MaskzReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_maskz_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceRoundSd

func MaskzReduceRoundSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

MaskzReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		IF k[0]
			dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		ELSE
			dst[63:0] := 0
		FI
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_maskz_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceRoundSs

func MaskzReduceRoundSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

MaskzReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		IF k[0]
			dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		ELSE
			dst[31:0] := 0
		FI
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_maskz_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceSd

func MaskzReduceSd(k x86.Mmask8, a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

MaskzReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

IF k[0]
	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
	dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_maskz_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzReduceSs

func MaskzReduceSs(k x86.Mmask8, a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

MaskzReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst' using zeromask 'k' (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

IF k[0]
	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
	dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_maskz_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func MaskzXorPd

func MaskzXorPd(k x86.Mmask8, a x86.M128d, b x86.M128d) (dst x86.M128d)

MaskzXorPd: Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPD'. Intrinsic: '_mm_maskz_xor_pd'. Requires AVX512DQ.

func MaskzXorPs

func MaskzXorPs(k x86.Mmask8, a x86.M128, b x86.M128) (dst x86.M128)

MaskzXorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VXORPS'. Intrinsic: '_mm_maskz_xor_ps'. Requires AVX512DQ.

func Movepi32Mask

func Movepi32Mask(a x86.M128i) (dst x86.Mmask8)

Movepi32Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 32-bit integer in 'a'.

FOR j := 0 to 3
	i := j*32
	IF a[i+31]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:4] := 0

Instruction: 'VPMOVD2M'. Intrinsic: '_mm_movepi32_mask'. Requires AVX512DQ.

func Movepi64Mask

func Movepi64Mask(a x86.M128i) (dst x86.Mmask8)

Movepi64Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 64-bit integer in 'a'.

FOR j := 0 to 1
	i := j*64
	IF a[i+63]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:2] := 0

Instruction: 'VPMOVQ2M'. Intrinsic: '_mm_movepi64_mask'. Requires AVX512DQ.

func MovmEpi32

func MovmEpi32(k x86.Mmask8) (dst x86.M128i)

MovmEpi32: Set each packed 32-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := 0xFFFFFFFF
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVM2D'. Intrinsic: '_mm_movm_epi32'. Requires AVX512DQ.

func MovmEpi64

func MovmEpi64(k x86.Mmask8) (dst x86.M128i)

MovmEpi64: Set each packed 64-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 1
	i := j*64
	IF k[j]
		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVM2Q'. Intrinsic: '_mm_movm_epi64'. Requires AVX512DQ.

func MulloEpi64

func MulloEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

MulloEpi64: Multiply the packed 64-bit integers in 'a' and 'b', producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in 'dst'.

FOR j := 0 to 1
	i := j*64
	tmp[127:0] := a[i+63:i] * b[i+63:i]
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLQ'. Intrinsic: '_mm_mullo_epi64'. Requires AVX512DQ.

func RangePd

func RangePd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

RangePd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
	1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
	2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
	3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
	1: dst[63:0] := tmp[63:0]
	2: dst[63:0] := (0 << 63) OR (tmp[62:0])
	3: dst[63:0] := (1 << 63) OR (tmp[62:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPD'. Intrinsic: '_mm_range_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangePs

func RangePs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

RangePs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
	CASE opCtl[1:0]
	0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
	1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
	2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
	3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
	ESAC

	CASE signSelCtl[1:0]
	0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
	1: dst[31:0] := tmp[31:0]
	2: dst[31:0] := (0 << 31) OR (tmp[30:0])
	3: dst[31:0] := (1 << 31) OR (tmp[30:0])
	ESAC

	RETURN dst
}

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VRANGEPS'. Intrinsic: '_mm_range_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangeRoundSd

func RangeRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

RangeRoundSd: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower double-precision (64-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper element from 'a' to the upper element of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
			1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
			2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
			3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
			1: dst[63:0] := tmp[63:0]
			2: dst[63:0] := (0 << 63) OR (tmp[62:0])
			3: dst[63:0] := (1 << 63) OR (tmp[62:0])
			ESAC

			RETURN dst
		}

		dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
		dst[127:64] := a[127:64]
		dst[MAX:128] := 0

Instruction: 'VRANGESD'. Intrinsic: '_mm_range_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func RangeRoundSs

func RangeRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

RangeRoundSs: Calculate the max, min, absolute max, or absolute min (depending on control in 'imm8') for the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 =

absolute max, 11 = absolute min.

imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from

compare result, 10 = clear sign bit, 11 = set sign bit.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
		{
			CASE opCtl[1:0]
			0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
			1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
			2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
			3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
			ESAC

			CASE signSelCtl[1:0]
			0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
			1: dst[31:0] := tmp[31:0]
			2: dst[31:0] := (0 << 31) OR (tmp[30:0])
			3: dst[31:0] := (1 << 31) OR (tmp[30:0])
			ESAC

			RETURN dst
		}

		dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
		dst[127:32] := a[127:32]
		dst[MAX:128] := 0

Instruction: 'VRANGESS'. Intrinsic: '_mm_range_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReducePd

func ReducePd(a x86.M128d, imm8 byte) (dst x86.M128d)

ReducePd: Extract the reduced argument of packed double-precision (64-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPD'. Intrinsic: '_mm_reduce_pd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReducePs

func ReducePs(a x86.M128, imm8 byte) (dst x86.M128)

ReducePs: Extract the reduced argument of packed single-precision (32-bit) floating-point elements in 'a' by the number of bits specified by 'imm8', and store the results in 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VREDUCEPS'. Intrinsic: '_mm_reduce_ps'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceRoundSd

func ReduceRoundSd(a x86.M128d, b x86.M128d, imm8 byte, rounding int) (dst x86.M128d)

ReduceRoundSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPD(src1[63:0], imm8[7:0])
		{
			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
			tmp[63:0] := src1[63:0] - tmp[63:0]
			RETURN tmp[63:0]
		}

		dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
		dst[127:64] := b[127:64]
		dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_reduce_round_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceRoundSs

func ReduceRoundSs(a x86.M128, b x86.M128, imm8 byte, rounding int) (dst x86.M128)

ReduceRoundSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

Rounding is done according to the 'rounding' parameter, which can be one

of:

    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE

		ReduceArgumentPS(src1[31:0], imm8[7:0])
		{
			IF src1[31:0] == NAN
				RETURN (convert src1[31:0] to QNaN)
			FI

			m := imm8[7:4] // number of fraction bits after the binary point to be preserved
			rc := imm8[1:0] // round control
			rc_src := imm8[2] // round control source
			spe := 0
			tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
			tmp[31:0] := src1[31:0] - tmp[31:0]
			RETURN tmp[31:0]
		}

		dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
		dst[127:32] := b[127:32]
		dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_reduce_round_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceSd

func ReduceSd(a x86.M128d, b x86.M128d, imm8 byte) (dst x86.M128d)

ReduceSd: Extract the reduced argument of the lower double-precision (64-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper element from 'b' to the upper element of 'dst'.

ReduceArgumentPD(src1[63:0], imm8[7:0])
{
	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
	tmp[63:0] := src1[63:0] - tmp[63:0]
	RETURN tmp[63:0]
}

dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0

Instruction: 'VREDUCESD'. Intrinsic: '_mm_reduce_sd'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

func ReduceSs

func ReduceSs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

ReduceSs: Extract the reduced argument of the lower single-precision (32-bit) floating-point element in 'a' by the number of bits specified by 'imm8', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'b' to the upper elements of 'dst'.

ReduceArgumentPS(src1[31:0], imm8[7:0])
{
	IF src1[31:0] == NAN
		RETURN (convert src1[31:0] to QNaN)
	FI

	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
	rc := imm8[1:0] // round control
	rc_src := imm8[2] // round control source
	spe := 0
	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
	tmp[31:0] := src1[31:0] - tmp[31:0]
	RETURN tmp[31:0]
}

dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0

Instruction: 'VREDUCESS'. Intrinsic: '_mm_reduce_ss'. Requires AVX512DQ.

FIXME: Requires compiler support (has immediate)

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL