sse

package

v0.0.0-...-3878f85 Latest Latest Go to latest Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/klauspost/intrinsics

Links

Open Source Insights

Documentation ¶

Overview ¶

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index ¶

func AcosPd(a x86.M128d) (dst x86.M128d)
func AcosPs(a x86.M128) (dst x86.M128)
func AcoshPd(a x86.M128d) (dst x86.M128d)
func AcoshPs(a x86.M128) (dst x86.M128)
func AddPs(a x86.M128, b x86.M128) (dst x86.M128)
func AddSs(a x86.M128, b x86.M128) (dst x86.M128)
func AndPs(a x86.M128, b x86.M128) (dst x86.M128)
func AndnotPs(a x86.M128, b x86.M128) (dst x86.M128)
func AsinPd(a x86.M128d) (dst x86.M128d)
func AsinPs(a x86.M128) (dst x86.M128)
func AsinhPd(a x86.M128d) (dst x86.M128d)
func AsinhPs(a x86.M128) (dst x86.M128)
func Atan2Pd(a x86.M128d, b x86.M128d) (dst x86.M128d)
func Atan2Ps(a x86.M128, b x86.M128) (dst x86.M128)
func AtanPd(a x86.M128d) (dst x86.M128d)
func AtanPs(a x86.M128) (dst x86.M128)
func AtanhPd(a x86.M128d) (dst x86.M128d)
func AtanhPs(a x86.M128) (dst x86.M128)
func AvgPu16(a x86.M64, b x86.M64) (dst x86.M64)
func AvgPu8(a x86.M64, b x86.M64) (dst x86.M64)
func CbrtPd(a x86.M128d) (dst x86.M128d)
func CbrtPs(a x86.M128) (dst x86.M128)
func CdfnormPd(a x86.M128d) (dst x86.M128d)
func CdfnormPs(a x86.M128) (dst x86.M128)
func CdfnorminvPd(a x86.M128d) (dst x86.M128d)
func CdfnorminvPs(a x86.M128) (dst x86.M128)
func CexpPs(a x86.M128) (dst x86.M128)
func ClogPs(a x86.M128) (dst x86.M128)
func CmpeqPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpeqSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpgePs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpgeSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpgtPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpgtSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmplePs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpleSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpltPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpltSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpneqPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpneqSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpngePs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpngeSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpngtPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpngtSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpnlePs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpnleSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpnltPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpnltSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpordPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpordSs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpunordPs(a x86.M128, b x86.M128) (dst x86.M128)
func CmpunordSs(a x86.M128, b x86.M128) (dst x86.M128)
func ComieqSs(a x86.M128, b x86.M128) int
func ComigeSs(a x86.M128, b x86.M128) int
func ComigtSs(a x86.M128, b x86.M128) int
func ComileSs(a x86.M128, b x86.M128) int
func ComiltSs(a x86.M128, b x86.M128) int
func ComineqSs(a x86.M128, b x86.M128) int
func CosPd(a x86.M128d) (dst x86.M128d)
func CosPs(a x86.M128) (dst x86.M128)
func CosdPd(a x86.M128d) (dst x86.M128d)
func CosdPs(a x86.M128) (dst x86.M128)
func CoshPd(a x86.M128d) (dst x86.M128d)
func CoshPs(a x86.M128) (dst x86.M128)
func CsqrtPs(a x86.M128) (dst x86.M128)
func CvtPi2ps(a x86.M128, b x86.M64) (dst x86.M128)
func CvtPs2pi(a x86.M128) (dst x86.M64)
func CvtSi2ss(a x86.M128, b int) (dst x86.M128)
func CvtSs2si(a x86.M128) int
func Cvtpi16Ps(a x86.M64) (dst x86.M128)
func Cvtpi32Ps(a x86.M128, b x86.M64) (dst x86.M128)
func Cvtpi32x2Ps(a x86.M64, b x86.M64) (dst x86.M128)
func Cvtpi8Ps(a x86.M64) (dst x86.M128)
func CvtpsPi16(a x86.M128) (dst x86.M64)
func CvtpsPi32(a x86.M128) (dst x86.M64)
func CvtpsPi8(a x86.M128) (dst x86.M64)
func Cvtpu16Ps(a x86.M64) (dst x86.M128)
func Cvtpu8Ps(a x86.M64) (dst x86.M128)
func Cvtsi32Ss(a x86.M128, b int) (dst x86.M128)
func Cvtsi64Ss(a x86.M128, b int64) (dst x86.M128)
func CvtssF32(a x86.M128) float32
func CvtssSi32(a x86.M128) int
func CvtssSi64(a x86.M128) int64
func CvttPs2pi(a x86.M128) (dst x86.M64)
func CvttSs2si(a x86.M128) int
func CvttpsPi32(a x86.M128) (dst x86.M64)
func CvttssSi32(a x86.M128) int
func CvttssSi64(a x86.M128) int64
func DivEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)
func DivPs(a x86.M128, b x86.M128) (dst x86.M128)
func DivSs(a x86.M128, b x86.M128) (dst x86.M128)
func ErfPd(a x86.M128d) (dst x86.M128d)
func ErfPs(a x86.M128) (dst x86.M128)
func ErfcPd(a x86.M128d) (dst x86.M128d)
func ErfcPs(a x86.M128) (dst x86.M128)
func ErfcinvPd(a x86.M128d) (dst x86.M128d)
func ErfcinvPs(a x86.M128) (dst x86.M128)
func ErfinvPd(a x86.M128d) (dst x86.M128d)
func ErfinvPs(a x86.M128) (dst x86.M128)
func Exp10Pd(a x86.M128d) (dst x86.M128d)
func Exp10Ps(a x86.M128) (dst x86.M128)
func Exp2Pd(a x86.M128d) (dst x86.M128d)
func Exp2Ps(a x86.M128) (dst x86.M128)
func ExpPd(a x86.M128d) (dst x86.M128d)
func ExpPs(a x86.M128) (dst x86.M128)
func Expm1Pd(a x86.M128d) (dst x86.M128d)
func Expm1Ps(a x86.M128) (dst x86.M128)
func ExtractPi16(a x86.M64, imm8 byte) int
func Getcsr() uint32
func HypotPd(a x86.M128d, b x86.M128d) (dst x86.M128d)
func HypotPs(a x86.M128, b x86.M128) (dst x86.M128)
func IdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func IdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)
func InsertPi16(a x86.M64, i int, imm8 byte) (dst x86.M64)
func InvcbrtPd(a x86.M128d) (dst x86.M128d)
func InvcbrtPs(a x86.M128) (dst x86.M128)
func InvsqrtPd(a x86.M128d) (dst x86.M128d)
func InvsqrtPs(a x86.M128) (dst x86.M128)
func IremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func LoadhPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)
func LoadlPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)
func Log10Pd(a x86.M128d) (dst x86.M128d)
func Log10Ps(a x86.M128) (dst x86.M128)
func Log1pPd(a x86.M128d) (dst x86.M128d)
func Log1pPs(a x86.M128) (dst x86.M128)
func Log2Pd(a x86.M128d) (dst x86.M128d)
func Log2Ps(a x86.M128) (dst x86.M128)
func LogPd(a x86.M128d) (dst x86.M128d)
func LogPs(a x86.M128) (dst x86.M128)
func LogbPd(a x86.M128d) (dst x86.M128d)
func LogbPs(a x86.M128) (dst x86.M128)
func MMGETEXCEPTIONMASK() uint32
func MMGETEXCEPTIONSTATE() uint32
func MMGETFLUSHZEROMODE() uint32
func MMGETROUNDINGMODE() uint32
func MMSETEXCEPTIONMASK(a uint32)
func MMSETEXCEPTIONSTATE(a uint32)
func MMSETFLUSHZEROMODE(a uint32)
func MMSETROUNDINGMODE(a uint32)
func MMTRANSPOSE4PS(row0 x86.M128, row1 x86.M128, row2 x86.M128, row3 x86.M128)
func MaskmoveSi64(a x86.M64, mask x86.M64, mem_addr *byte)
func Maskmovq(a x86.M64, mask x86.M64, mem_addr *byte)
func MaxPi16(a x86.M64, b x86.M64) (dst x86.M64)
func MaxPs(a x86.M128, b x86.M128) (dst x86.M128)
func MaxPu8(a x86.M64, b x86.M64) (dst x86.M64)
func MaxSs(a x86.M128, b x86.M128) (dst x86.M128)
func MinPi16(a x86.M64, b x86.M64) (dst x86.M64)
func MinPs(a x86.M128, b x86.M128) (dst x86.M128)
func MinPu8(a x86.M64, b x86.M64) (dst x86.M64)
func MinSs(a x86.M128, b x86.M128) (dst x86.M128)
func MoveSs(a x86.M128, b x86.M128) (dst x86.M128)
func MovehlPs(a x86.M128, b x86.M128) (dst x86.M128)
func MovelhPs(a x86.M128, b x86.M128) (dst x86.M128)
func MovemaskPi8(a x86.M64) int
func MovemaskPs(a x86.M128) int
func MulPs(a x86.M128, b x86.M128) (dst x86.M128)
func MulSs(a x86.M128, b x86.M128) (dst x86.M128)
func MulhiPu16(a x86.M64, b x86.M64) (dst x86.M64)
func OrPs(a x86.M128, b x86.M128) (dst x86.M128)
func Pavgb(a x86.M64, b x86.M64) (dst x86.M64)
func Pavgw(a x86.M64, b x86.M64) (dst x86.M64)
func Pextrw(a x86.M64, imm8 byte) int
func Pinsrw(a x86.M64, i int, imm8 byte) (dst x86.M64)
func Pmaxsw(a x86.M64, b x86.M64) (dst x86.M64)
func Pmaxub(a x86.M64, b x86.M64) (dst x86.M64)
func Pminsw(a x86.M64, b x86.M64) (dst x86.M64)
func Pminub(a x86.M64, b x86.M64) (dst x86.M64)
func Pmovmskb(a x86.M64) int
func Pmulhuw(a x86.M64, b x86.M64) (dst x86.M64)
func PowPd(a x86.M128d, b x86.M128d) (dst x86.M128d)
func PowPs(a x86.M128, b x86.M128) (dst x86.M128)
func Prefetch(p *byte, i int)
func Psadbw(a x86.M64, b x86.M64) (dst x86.M64)
func Pshufw(a x86.M64, imm8 byte) (dst x86.M64)
func RcpPs(a x86.M128) (dst x86.M128)
func RcpSs(a x86.M128) (dst x86.M128)
func RemEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RemEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)
func RsqrtPs(a x86.M128) (dst x86.M128)
func RsqrtSs(a x86.M128) (dst x86.M128)
func SadPu8(a x86.M64, b x86.M64) (dst x86.M64)
func Set1Ps(a float32) (dst x86.M128)
func SetPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)
func SetPs1(a float32) (dst x86.M128)
func SetSs(a float32) (dst x86.M128)
func Setcsr(a uint32)
func SetrPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)
func SetzeroPs() (dst x86.M128)
func Sfence()
func ShufflePi16(a x86.M64, imm8 byte) (dst x86.M64)
func ShufflePs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)
func SinPd(a x86.M128d) (dst x86.M128d)
func SinPs(a x86.M128) (dst x86.M128)
func SincosPd(mem_addr *x86.M128d, a x86.M128d) (dst x86.M128d)
func SincosPs(mem_addr *x86.M128, a x86.M128) (dst x86.M128)
func SindPd(a x86.M128d) (dst x86.M128d)
func SindPs(a x86.M128) (dst x86.M128)
func SinhPd(a x86.M128d) (dst x86.M128d)
func SinhPs(a x86.M128) (dst x86.M128)
func SqrtPs(a x86.M128) (dst x86.M128)
func SqrtSs(a x86.M128) (dst x86.M128)
func Store1Ps(mem_addr *float32, a x86.M128)
func StorePs(mem_addr *float32, a x86.M128)
func StorePs1(mem_addr *float32, a x86.M128)
func StoreSs(mem_addr *float32, a x86.M128)
func StorehPi(mem_addr *x86.M64, a x86.M128)
func StorelPi(mem_addr *x86.M64, a x86.M128)
func StorerPs(mem_addr *float32, a x86.M128)
func StoreuPs(mem_addr *float32, a x86.M128)
func StreamPi(mem_addr *x86.M64, a x86.M64)
func StreamPs(mem_addr *float32, a x86.M128)
func SubPs(a x86.M128, b x86.M128) (dst x86.M128)
func SubSs(a x86.M128, b x86.M128) (dst x86.M128)
func SvmlCeilPd(a x86.M128d) (dst x86.M128d)
func SvmlCeilPs(a x86.M128) (dst x86.M128)
func SvmlFloorPd(a x86.M128d) (dst x86.M128d)
func SvmlFloorPs(a x86.M128) (dst x86.M128)
func SvmlRoundPd(a x86.M128d) (dst x86.M128d)
func SvmlRoundPs(a x86.M128) (dst x86.M128)
func SvmlSqrtPd(a x86.M128d) (dst x86.M128d)
func SvmlSqrtPs(a x86.M128) (dst x86.M128)
func TanPd(a x86.M128d) (dst x86.M128d)
func TanPs(a x86.M128) (dst x86.M128)
func TandPd(a x86.M128d) (dst x86.M128d)
func TandPs(a x86.M128) (dst x86.M128)
func TanhPd(a x86.M128d) (dst x86.M128d)
func TanhPs(a x86.M128) (dst x86.M128)
func TruncPd(a x86.M128d) (dst x86.M128d)
func TruncPs(a x86.M128) (dst x86.M128)
func UcomieqSs(a x86.M128, b x86.M128) int
func UcomigeSs(a x86.M128, b x86.M128) int
func UcomigtSs(a x86.M128, b x86.M128) int
func UcomileSs(a x86.M128, b x86.M128) int
func UcomiltSs(a x86.M128, b x86.M128) int
func UcomineqSs(a x86.M128, b x86.M128) int
func UdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func UdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)
func UnpackhiPs(a x86.M128, b x86.M128) (dst x86.M128)
func UnpackloPs(a x86.M128, b x86.M128) (dst x86.M128)
func UremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)
func XorPs(a x86.M128, b x86.M128) (dst x86.M128)

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func AcosPd ¶

func AcosPd(a x86.M128d) (dst x86.M128d)

AcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acos_pd'. Requires SSE.

func AcosPs ¶

func AcosPs(a x86.M128) (dst x86.M128)

AcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acos_ps'. Requires SSE.

func AcoshPd ¶

func AcoshPd(a x86.M128d) (dst x86.M128d)

AcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acosh_pd'. Requires SSE.

func AcoshPs ¶

func AcoshPs(a x86.M128) (dst x86.M128)

AcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acosh_ps'. Requires SSE.

func AddPs ¶

func AddPs(a x86.M128, b x86.M128) (dst x86.M128)

AddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR

Instruction: 'ADDPS'. Intrinsic: '_mm_add_ps'. Requires SSE.

func AddSs ¶

func AddSs(a x86.M128, b x86.M128) (dst x86.M128)

AddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]

Instruction: 'ADDSS'. Intrinsic: '_mm_add_ss'. Requires SSE.

func AndPs ¶

func AndPs(a x86.M128, b x86.M128) (dst x86.M128)

AndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR

Instruction: 'ANDPS'. Intrinsic: '_mm_and_ps'. Requires SSE.

func AndnotPs ¶

func AndnotPs(a x86.M128, b x86.M128) (dst x86.M128)

AndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR

Instruction: 'ANDNPS'. Intrinsic: '_mm_andnot_ps'. Requires SSE.

func AsinPd ¶

func AsinPd(a x86.M128d) (dst x86.M128d)

AsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asin_pd'. Requires SSE.

func AsinPs ¶

func AsinPs(a x86.M128) (dst x86.M128)

AsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asin_ps'. Requires SSE.

func AsinhPd ¶

func AsinhPd(a x86.M128d) (dst x86.M128d)

AsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asinh_pd'. Requires SSE.

func AsinhPs ¶

func AsinhPs(a x86.M128) (dst x86.M128)

AsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asinh_ps'. Requires SSE.

func Atan2Pd ¶

func Atan2Pd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Atan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan2_pd'. Requires SSE.

func Atan2Ps ¶

func Atan2Ps(a x86.M128, b x86.M128) (dst x86.M128)

Atan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan2_ps'. Requires SSE.

func AtanPd ¶

func AtanPd(a x86.M128d) (dst x86.M128d)

AtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan_pd'. Requires SSE.

func AtanPs ¶

func AtanPs(a x86.M128) (dst x86.M128)

AtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan_ps'. Requires SSE.

func AtanhPd ¶

func AtanhPd(a x86.M128d) (dst x86.M128d)

AtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atanh_pd'. Requires SSE.

func AtanhPs ¶

func AtanhPs(a x86.M128) (dst x86.M128)

AtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atanh_ps'. Requires SSE.

func AvgPu16 ¶

func AvgPu16(a x86.M64, b x86.M64) (dst x86.M64)

AvgPu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGW'. Intrinsic: '_mm_avg_pu16'. Requires SSE.

func AvgPu8 ¶

func AvgPu8(a x86.M64, b x86.M64) (dst x86.M64)

AvgPu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGB'. Intrinsic: '_mm_avg_pu8'. Requires SSE.

func CbrtPd ¶

func CbrtPd(a x86.M128d) (dst x86.M128d)

CbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cbrt_pd'. Requires SSE.

func CbrtPs ¶

func CbrtPs(a x86.M128) (dst x86.M128)

CbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cbrt_ps'. Requires SSE.

func CdfnormPd ¶

func CdfnormPd(a x86.M128d) (dst x86.M128d)

CdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorm_pd'. Requires SSE.

func CdfnormPs ¶

func CdfnormPs(a x86.M128) (dst x86.M128)

CdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorm_ps'. Requires SSE.

func CdfnorminvPd ¶

func CdfnorminvPd(a x86.M128d) (dst x86.M128d)

CdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorminv_pd'. Requires SSE.

func CdfnorminvPs ¶

func CdfnorminvPs(a x86.M128) (dst x86.M128)

CdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorminv_ps'. Requires SSE.

func CexpPs ¶

func CexpPs(a x86.M128) (dst x86.M128)

CexpPs: Compute the exponential value of 'e' raised to the power of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cexp_ps'. Requires SSE.

func ClogPs ¶

func ClogPs(a x86.M128) (dst x86.M128)

ClogPs: Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_clog_ps'. Requires SSE.

func CmpeqPs ¶

func CmpeqPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpeqPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpeq_ps'. Requires SSE.

func CmpeqSs ¶

func CmpeqSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpeqSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpeq_ss'. Requires SSE.

func CmpgePs ¶

func CmpgePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpge_ps'. Requires SSE.

func CmpgeSs ¶

func CmpgeSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgeSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpge_ss'. Requires SSE.

func CmpgtPs ¶

func CmpgtPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgtPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpgt_ps'. Requires SSE.

func CmpgtSs ¶

func CmpgtSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgtSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpgt_ss'. Requires SSE.

func CmplePs ¶

func CmplePs(a x86.M128, b x86.M128) (dst x86.M128)

CmplePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmple_ps'. Requires SSE.

func CmpleSs ¶

func CmpleSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpleSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmple_ss'. Requires SSE.

func CmpltPs ¶

func CmpltPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpltPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmplt_ps'. Requires SSE.

func CmpltSs ¶

func CmpltSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpltSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmplt_ss'. Requires SSE.

func CmpneqPs ¶

func CmpneqPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpneqPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpneq_ps'. Requires SSE.

func CmpneqSs ¶

func CmpneqSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpneqSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpneq_ss'. Requires SSE.

func CmpngePs ¶

func CmpngePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnge_ps'. Requires SSE.

func CmpngeSs ¶

func CmpngeSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngeSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnge_ss'. Requires SSE.

func CmpngtPs ¶

func CmpngtPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngtPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpngt_ps'. Requires SSE.

func CmpngtSs ¶

func CmpngtSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngtSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpngt_ss'. Requires SSE.

func CmpnlePs ¶

func CmpnlePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnlePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnle_ps'. Requires SSE.

func CmpnleSs ¶

func CmpnleSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnleSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnle_ss'. Requires SSE.

func CmpnltPs ¶

func CmpnltPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnltPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnlt_ps'. Requires SSE.

func CmpnltSs ¶

func CmpnltSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnltSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnlt_ss'. Requires SSE.

func CmpordPs ¶

func CmpordPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpordPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpord_ps'. Requires SSE.

func CmpordSs ¶

func CmpordSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpordSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpord_ss'. Requires SSE.

func CmpunordPs ¶

func CmpunordPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpunordPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpunord_ps'. Requires SSE.

func CmpunordSs ¶

func CmpunordSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpunordSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpunord_ss'. Requires SSE.

func ComieqSs ¶

func ComieqSs(a x86.M128, b x86.M128) int

ComieqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for equality, and return the boolean result (0 or 1).

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comieq_ss'. Requires SSE.

func ComigeSs ¶

func ComigeSs(a x86.M128, b x86.M128) int

ComigeSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than-or-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comige_ss'. Requires SSE.

func ComigtSs ¶

func ComigtSs(a x86.M128, b x86.M128) int

ComigtSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than, and return the boolean result (0 or 1).

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comigt_ss'. Requires SSE.

func ComileSs ¶

func ComileSs(a x86.M128, b x86.M128) int

ComileSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than-or-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comile_ss'. Requires SSE.

func ComiltSs ¶

func ComiltSs(a x86.M128, b x86.M128) int

ComiltSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than, and return the boolean result (0 or 1).

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comilt_ss'. Requires SSE.

func ComineqSs ¶

func ComineqSs(a x86.M128, b x86.M128) int

ComineqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for not-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comineq_ss'. Requires SSE.

func CosPd ¶

func CosPd(a x86.M128d) (dst x86.M128d)

CosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cos_pd'. Requires SSE.

func CosPs ¶

func CosPs(a x86.M128) (dst x86.M128)

CosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cos_ps'. Requires SSE.

func CosdPd ¶

func CosdPd(a x86.M128d) (dst x86.M128d)

CosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosd_pd'. Requires SSE.

func CosdPs ¶

func CosdPs(a x86.M128) (dst x86.M128)

CosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosd_ps'. Requires SSE.

func CoshPd ¶

func CoshPd(a x86.M128d) (dst x86.M128d)

CoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosh_pd'. Requires SSE.

func CoshPs ¶

func CoshPs(a x86.M128) (dst x86.M128)

CoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosh_ps'. Requires SSE.

func CsqrtPs ¶

func CsqrtPs(a x86.M128) (dst x86.M128)

CsqrtPs: Compute the square root of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_csqrt_ps'. Requires SSE.

func CvtPi2ps ¶

func CvtPi2ps(a x86.M128, b x86.M64) (dst x86.M128)

CvtPi2ps: Convert packed 32-bit integers in 'b' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', and copy the upper 2 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'CVTPI2PS'. Intrinsic: '_mm_cvt_pi2ps'. Requires SSE.

func CvtPs2pi ¶

func CvtPs2pi(a x86.M128) (dst x86.M64)

CvtPs2pi: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR

Instruction: 'CVTPS2PI'. Intrinsic: '_mm_cvt_ps2pi'. Requires SSE.

func CvtSi2ss ¶

func CvtSi2ss(a x86.M128, b int) (dst x86.M128)

CvtSi2ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvt_si2ss'. Requires SSE.

func CvtSs2si ¶

func CvtSs2si(a x86.M128) int

CvtSs2si: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvt_ss2si'. Requires SSE.

func Cvtpi16Ps ¶

func Cvtpi16Ps(a x86.M64) (dst x86.M128)

Cvtpi16Ps: Convert packed 16-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpi16_ps'. Requires SSE.

func Cvtpi32Ps ¶

func Cvtpi32Ps(a x86.M128, b x86.M64) (dst x86.M128)

Cvtpi32Ps: Convert packed 32-bit integers in 'b' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', and copy the upper 2 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'CVTPI2PS'. Intrinsic: '_mm_cvtpi32_ps'. Requires SSE.

func Cvtpi32x2Ps ¶

func Cvtpi32x2Ps(a x86.M64, b x86.M64) (dst x86.M128)

Cvtpi32x2Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', then convert the packed 32-bit integers in 'b' to single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(a[31:0])
dst[63:32] := Convert_Int32_To_FP32(a[63:32])
dst[95:64] := Convert_Int32_To_FP32(b[31:0])
dst[127:96] := Convert_Int32_To_FP32(b[63:32])

Instruction: '...'. Intrinsic: '_mm_cvtpi32x2_ps'. Requires SSE.

func Cvtpi8Ps ¶

func Cvtpi8Ps(a x86.M64) (dst x86.M128)

Cvtpi8Ps: Convert the lower packed 8-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpi8_ps'. Requires SSE.

func CvtpsPi16 ¶

func CvtpsPi16(a x86.M128) (dst x86.M64)

CvtpsPi16: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 16*j
	k := 32*j
	dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtps_pi16'. Requires SSE.

func CvtpsPi32 ¶

func CvtpsPi32(a x86.M128) (dst x86.M64)

CvtpsPi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR

Instruction: 'CVTPS2PI'. Intrinsic: '_mm_cvtps_pi32'. Requires SSE.

func CvtpsPi8 ¶

func CvtpsPi8(a x86.M128) (dst x86.M64)

CvtpsPi8: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 8-bit integers, and store the results in lower 4 elements of 'dst'.

FOR j := 0 to 3
	i := 8*j
	k := 32*j
	dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtps_pi8'. Requires SSE.

func Cvtpu16Ps ¶

func Cvtpu16Ps(a x86.M64) (dst x86.M128)

Cvtpu16Ps: Convert packed unsigned 16-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpu16_ps'. Requires SSE.

func Cvtpu8Ps ¶

func Cvtpu8Ps(a x86.M64) (dst x86.M128)

Cvtpu8Ps: Convert the lower packed unsigned 8-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpu8_ps'. Requires SSE.

func Cvtsi32Ss ¶

func Cvtsi32Ss(a x86.M128, b int) (dst x86.M128)

Cvtsi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvtsi32_ss'. Requires SSE.

func Cvtsi64Ss ¶

func Cvtsi64Ss(a x86.M128, b int64) (dst x86.M128)

Cvtsi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvtsi64_ss'. Requires SSE.

func CvtssF32 ¶

func CvtssF32(a x86.M128) float32

CvtssF32: Copy the lower single-precision (32-bit) floating-point element of 'a' to 'dst'.

dst[31:0] := a[31:0]

Instruction: 'MOVSS'. Intrinsic: '_mm_cvtss_f32'. Requires SSE.

func CvtssSi32 ¶

func CvtssSi32(a x86.M128) int

CvtssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvtss_si32'. Requires SSE.

func CvtssSi64 ¶

func CvtssSi64(a x86.M128) int64

CvtssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvtss_si64'. Requires SSE.

func CvttPs2pi ¶

func CvttPs2pi(a x86.M128) (dst x86.M64)

CvttPs2pi: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR

Instruction: 'CVTTPS2PI'. Intrinsic: '_mm_cvtt_ps2pi'. Requires SSE.

func CvttSs2si ¶

func CvttSs2si(a x86.M128) int

CvttSs2si: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvtt_ss2si'. Requires SSE.

func CvttpsPi32 ¶

func CvttpsPi32(a x86.M128) (dst x86.M64)

CvttpsPi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR

Instruction: 'CVTTPS2PI'. Intrinsic: '_mm_cvttps_pi32'. Requires SSE.

func CvttssSi32 ¶

func CvttssSi32(a x86.M128) int

CvttssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvttss_si32'. Requires SSE.

func CvttssSi64 ¶

func CvttssSi64(a x86.M128) int64

CvttssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvttss_si64'. Requires SSE.

func DivEpi16 ¶

func DivEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi16'. Requires SSE.

func DivEpi32 ¶

func DivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi32'. Requires SSE.

func DivEpi64 ¶

func DivEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi64'. Requires SSE.

func DivEpi8 ¶

func DivEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi8'. Requires SSE.

func DivEpu16 ¶

func DivEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu16'. Requires SSE.

func DivEpu32 ¶

func DivEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu32'. Requires SSE.

func DivEpu64 ¶

func DivEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu64'. Requires SSE.

func DivEpu8 ¶

func DivEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu8'. Requires SSE.

func DivPs ¶

func DivPs(a x86.M128, b x86.M128) (dst x86.M128)

DivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR

Instruction: 'DIVPS'. Intrinsic: '_mm_div_ps'. Requires SSE.

func DivSs ¶

func DivSs(a x86.M128, b x86.M128) (dst x86.M128)

DivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]

Instruction: 'DIVSS'. Intrinsic: '_mm_div_ss'. Requires SSE.

func ErfPd ¶

func ErfPd(a x86.M128d) (dst x86.M128d)

ErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erf_pd'. Requires SSE.

func ErfPs ¶

func ErfPs(a x86.M128) (dst x86.M128)

ErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erf_ps'. Requires SSE.

func ErfcPd ¶

func ErfcPd(a x86.M128d) (dst x86.M128d)

ErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfc_pd'. Requires SSE.

func ErfcPs ¶

func ErfcPs(a x86.M128) (dst x86.M128)

ErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfc_ps'. Requires SSE.

func ErfcinvPd ¶

func ErfcinvPd(a x86.M128d) (dst x86.M128d)

ErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfcinv_pd'. Requires SSE.

func ErfcinvPs ¶

func ErfcinvPs(a x86.M128) (dst x86.M128)

ErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfcinv_ps'. Requires SSE.

func ErfinvPd ¶

func ErfinvPd(a x86.M128d) (dst x86.M128d)

ErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 1.0 / ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfinv_pd'. Requires SSE.

func ErfinvPs ¶

func ErfinvPs(a x86.M128) (dst x86.M128)

ErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 1.0 / ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfinv_ps'. Requires SSE.

func Exp10Pd ¶

func Exp10Pd(a x86.M128d) (dst x86.M128d)

Exp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp10_pd'. Requires SSE.

func Exp10Ps ¶

func Exp10Ps(a x86.M128) (dst x86.M128)

Exp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp10_ps'. Requires SSE.

func Exp2Pd ¶

func Exp2Pd(a x86.M128d) (dst x86.M128d)

Exp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp2_pd'. Requires SSE.

func Exp2Ps ¶

func Exp2Ps(a x86.M128) (dst x86.M128)

Exp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp2_ps'. Requires SSE.

func ExpPd ¶

func ExpPd(a x86.M128d) (dst x86.M128d)

ExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp_pd'. Requires SSE.

func ExpPs ¶

func ExpPs(a x86.M128) (dst x86.M128)

ExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp_ps'. Requires SSE.

func Expm1Pd ¶

func Expm1Pd(a x86.M128d) (dst x86.M128d)

Expm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_expm1_pd'. Requires SSE.

func Expm1Ps ¶

func Expm1Ps(a x86.M128) (dst x86.M128)

Expm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_expm1_ps'. Requires SSE.

func ExtractPi16 ¶

func ExtractPi16(a x86.M64, imm8 byte) int

ExtractPi16: Extract a 16-bit integer from 'a', selected with 'imm8', and store the result in the lower element of 'dst'.

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0

Instruction: 'PEXTRW'. Intrinsic: '_mm_extract_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Getcsr ¶

func Getcsr() uint32

Getcsr: Get the unsigned 32-bit value of the MXCSR control and status register.

dst[31:0] := MXCSR

Instruction: 'STMXCSR'. Intrinsic: '_mm_getcsr'. Requires SSE.

func HypotPd ¶

func HypotPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

HypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_hypot_pd'. Requires SSE.

func HypotPs ¶

func HypotPs(a x86.M128, b x86.M128) (dst x86.M128)

HypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_hypot_ps'. Requires SSE.

func IdivEpi32 ¶

func IdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

IdivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_idiv_epi32'. Requires SSE.

func IdivremEpi32 ¶

func IdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)

IdivremEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', store the truncated results in 'dst', and store the remainders as packed 32-bit integers into memory at 'mem_addr'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_idivrem_epi32'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func InsertPi16 ¶

func InsertPi16(a x86.M64, i int, imm8 byte) (dst x86.M64)

InsertPi16: Copy 'a' to 'dst', and insert the 16-bit integer 'i' into 'dst' at the location specified by 'imm8'.

dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]

Instruction: 'PINSRW'. Intrinsic: '_mm_insert_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func InvcbrtPd ¶

func InvcbrtPd(a x86.M128d) (dst x86.M128d)

InvcbrtPd: Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvCubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invcbrt_pd'. Requires SSE.

func InvcbrtPs ¶

func InvcbrtPs(a x86.M128) (dst x86.M128)

InvcbrtPs: Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvCubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invcbrt_ps'. Requires SSE.

func InvsqrtPd ¶

func InvsqrtPd(a x86.M128d) (dst x86.M128d)

InvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invsqrt_pd'. Requires SSE.

func InvsqrtPs ¶

func InvsqrtPs(a x86.M128) (dst x86.M128)

InvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invsqrt_ps'. Requires SSE.

func IremEpi32 ¶

func IremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

IremEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_irem_epi32'. Requires SSE.

func LoadhPi ¶

func LoadhPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)

LoadhPi: Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of 'dst', and copy the lower 2 elements from 'a' to 'dst'. 'mem_addr' does not need to be aligned on any particular boundary.

dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+63:mem_addr+32]

Instruction: 'MOVHPS'. Intrinsic: '_mm_loadh_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func LoadlPi ¶

func LoadlPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)

LoadlPi: Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of 'dst', and copy the upper 2 elements from 'a' to 'dst'. 'mem_addr' does not need to be aligned on any particular boundary.

dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+63:mem_addr+32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVLPS'. Intrinsic: '_mm_loadl_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Log10Pd ¶

func Log10Pd(a x86.M128d) (dst x86.M128d)

Log10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log10_pd'. Requires SSE.

func Log10Ps ¶

func Log10Ps(a x86.M128) (dst x86.M128)

Log10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log10_ps'. Requires SSE.

func Log1pPd ¶

func Log1pPd(a x86.M128d) (dst x86.M128d)

Log1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log1p_pd'. Requires SSE.

func Log1pPs ¶

func Log1pPs(a x86.M128) (dst x86.M128)

Log1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log1p_ps'. Requires SSE.

func Log2Pd ¶

func Log2Pd(a x86.M128d) (dst x86.M128d)

Log2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log2_pd'. Requires SSE.

func Log2Ps ¶

func Log2Ps(a x86.M128) (dst x86.M128)

Log2Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log2_ps'. Requires SSE.

func LogPd ¶

func LogPd(a x86.M128d) (dst x86.M128d)

LogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log_pd'. Requires SSE.

func LogPs ¶

func LogPs(a x86.M128) (dst x86.M128)

LogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log_ps'. Requires SSE.

func LogbPd ¶

func LogbPd(a x86.M128d) (dst x86.M128d)

LogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_logb_pd'. Requires SSE.

func LogbPs ¶

func LogbPs(a x86.M128) (dst x86.M128)

LogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_logb_ps'. Requires SSE.

func MMGETEXCEPTIONMASK ¶

func MMGETEXCEPTIONMASK() uint32

MMGETEXCEPTIONMASK: Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

dst[31:0] := MXCSR & _MM_MASK_MASK

Instruction: ''. Intrinsic: '_MM_GET_EXCEPTION_MASK'. Requires SSE.

func MMGETEXCEPTIONSTATE ¶

func MMGETEXCEPTIONSTATE() uint32

MMGETEXCEPTIONSTATE: Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

dst[31:0] := MXCSR & _MM_EXCEPT_MASK

Instruction: ''. Intrinsic: '_MM_GET_EXCEPTION_STATE'. Requires SSE.

func MMGETFLUSHZEROMODE ¶

func MMGETFLUSHZEROMODE() uint32

MMGETFLUSHZEROMODE: Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

dst[31:0] := MXCSR & _MM_FLUSH_MASK

Instruction: ''. Intrinsic: '_MM_GET_FLUSH_ZERO_MODE'. Requires SSE.

func MMGETROUNDINGMODE ¶

func MMGETROUNDINGMODE() uint32

MMGETROUNDINGMODE: Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

dst[31:0] := MXCSR & _MM_ROUND_MASK

Instruction: ''. Intrinsic: '_MM_GET_ROUNDING_MODE'. Requires SSE.

func MMSETEXCEPTIONMASK ¶

func MMSETEXCEPTIONMASK(a uint32)

MMSETEXCEPTIONMASK: Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

MXCSR := a[31:0] AND ~_MM_MASK_MASK

Instruction: ''. Intrinsic: '_MM_SET_EXCEPTION_MASK'. Requires SSE.

func MMSETEXCEPTIONSTATE ¶

func MMSETEXCEPTIONSTATE(a uint32)

MMSETEXCEPTIONSTATE: Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK

Instruction: ''. Intrinsic: '_MM_SET_EXCEPTION_STATE'. Requires SSE.

func MMSETFLUSHZEROMODE ¶

func MMSETFLUSHZEROMODE(a uint32)

MMSETFLUSHZEROMODE: Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

MXCSR := a[31:0] AND ~_MM_FLUSH_MASK

Instruction: ''. Intrinsic: '_MM_SET_FLUSH_ZERO_MODE'. Requires SSE.

func MMSETROUNDINGMODE ¶

func MMSETROUNDINGMODE(a uint32)

MMSETROUNDINGMODE: Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

MXCSR := a[31:0] AND ~_MM_ROUND_MASK

Instruction: ''. Intrinsic: '_MM_SET_ROUNDING_MODE'. Requires SSE.

func MMTRANSPOSE4PS ¶

func MMTRANSPOSE4PS(row0 x86.M128, row1 x86.M128, row2 x86.M128, row3 x86.M128)

MMTRANSPOSE4PS: Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in 'row0', 'row1', 'row2', and 'row3', and store the transposed matrix in these vectors ('row0' now contains column 0, etc.).

__m128 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm_unpacklo_ps(row0, row1);
tmp2 = _mm_unpacklo_ps(row2, row3);
tmp1 = _mm_unpackhi_ps(row0, row1);
tmp3 = _mm_unpackhi_ps(row2, row3);
row0 = _mm_movelh_ps(tmp0, tmp2);
row1 = _mm_movehl_ps(tmp2, tmp0);
row2 = _mm_movelh_ps(tmp1, tmp3);
row3 = _mm_movehl_ps(tmp3, tmp1);

Instruction: '...'. Intrinsic: '_MM_TRANSPOSE4_PS'. Requires SSE.

func MaskmoveSi64 ¶

func MaskmoveSi64(a x86.M64, mask x86.M64, mem_addr *byte)

MaskmoveSi64: Conditionally store 8-bit integer elements from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.

FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR

Instruction: 'MASKMOVQ'. Intrinsic: '_mm_maskmove_si64'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Maskmovq ¶

func Maskmovq(a x86.M64, mask x86.M64, mem_addr *byte)

Maskmovq: Conditionally store 8-bit integer elements from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR

Instruction: 'MASKMOVQ'. Intrinsic: '_m_maskmovq'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaxPi16 ¶

func MaxPi16(a x86.M64, b x86.M64) (dst x86.M64)

MaxPi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMAXSW'. Intrinsic: '_mm_max_pi16'. Requires SSE.

func MaxPs ¶

func MaxPs(a x86.M128, b x86.M128) (dst x86.M128)

MaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR

Instruction: 'MAXPS'. Intrinsic: '_mm_max_ps'. Requires SSE.

func MaxPu8 ¶

func MaxPu8(a x86.M64, b x86.M64) (dst x86.M64)

MaxPu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMAXUB'. Intrinsic: '_mm_max_pu8'. Requires SSE.

func MaxSs ¶

func MaxSs(a x86.M128, b x86.M128) (dst x86.M128)

MaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]

Instruction: 'MAXSS'. Intrinsic: '_mm_max_ss'. Requires SSE.

func MinPi16 ¶

func MinPi16(a x86.M64, b x86.M64) (dst x86.M64)

MinPi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMINSW'. Intrinsic: '_mm_min_pi16'. Requires SSE.

func MinPs ¶

func MinPs(a x86.M128, b x86.M128) (dst x86.M128)

MinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR

Instruction: 'MINPS'. Intrinsic: '_mm_min_ps'. Requires SSE.

func MinPu8 ¶

func MinPu8(a x86.M64, b x86.M64) (dst x86.M64)

MinPu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMINUB'. Intrinsic: '_mm_min_pu8'. Requires SSE.

func MinSs ¶

func MinSs(a x86.M128, b x86.M128) (dst x86.M128)

MinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]

Instruction: 'MINSS'. Intrinsic: '_mm_min_ss'. Requires SSE.

func MoveSs ¶

func MoveSs(a x86.M128, b x86.M128) (dst x86.M128)

MoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst', and copy the upper 3 elements from 'a' to the upper elements of 'dst'.

dst[31:0] := b[31:0]
dst[63:32] := a[63:32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVSS'. Intrinsic: '_mm_move_ss'. Requires SSE.

func MovehlPs ¶

func MovehlPs(a x86.M128, b x86.M128) (dst x86.M128)

MovehlPs: Move the upper 2 single-precision (32-bit) floating-point elements from 'b' to the lower 2 elements of 'dst', and copy the upper 2 elements from 'a' to the upper 2 elements of 'dst'.

dst[31:0] := b[95:64]
dst[63:32] := b[127:96]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVHLPS'. Intrinsic: '_mm_movehl_ps'. Requires SSE.

func MovelhPs ¶

func MovelhPs(a x86.M128, b x86.M128) (dst x86.M128)

MovelhPs: Move the lower 2 single-precision (32-bit) floating-point elements from 'b' to the upper 2 elements of 'dst', and copy the lower 2 elements from 'a' to the lower 2 elements of 'dst'.

dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := b[31:0]
dst[127:96] := b[63:32]

Instruction: 'MOVLHPS'. Intrinsic: '_mm_movelh_ps'. Requires SSE.

func MovemaskPi8 ¶

func MovemaskPi8(a x86.M64) int

MovemaskPi8: Create mask from the most significant bit of each 8-bit element in 'a', and store the result in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0

Instruction: 'PMOVMSKB'. Intrinsic: '_mm_movemask_pi8'. Requires SSE.

func MovemaskPs ¶

func MovemaskPs(a x86.M128) int

MovemaskPs: Set each bit of mask 'dst' based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in 'a'.

FOR j := 0 to 3
	i := j*32
	IF a[i+31]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:4] := 0

Instruction: 'MOVMSKPS'. Intrinsic: '_mm_movemask_ps'. Requires SSE.

func MulPs ¶

func MulPs(a x86.M128, b x86.M128) (dst x86.M128)

MulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR

Instruction: 'MULPS'. Intrinsic: '_mm_mul_ps'. Requires SSE.

func MulSs ¶

func MulSs(a x86.M128, b x86.M128) (dst x86.M128)

MulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]

Instruction: 'MULSS'. Intrinsic: '_mm_mul_ss'. Requires SSE.

func MulhiPu16 ¶

func MulhiPu16(a x86.M64, b x86.M64) (dst x86.M64)

MulhiPu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR

Instruction: 'PMULHUW'. Intrinsic: '_mm_mulhi_pu16'. Requires SSE.

func OrPs ¶

func OrPs(a x86.M128, b x86.M128) (dst x86.M128)

OrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR

Instruction: 'ORPS'. Intrinsic: '_mm_or_ps'. Requires SSE.

func Pavgb ¶

func Pavgb(a x86.M64, b x86.M64) (dst x86.M64)

Pavgb: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGB'. Intrinsic: '_m_pavgb'. Requires SSE.

func Pavgw ¶

func Pavgw(a x86.M64, b x86.M64) (dst x86.M64)

Pavgw: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGW'. Intrinsic: '_m_pavgw'. Requires SSE.

func Pextrw ¶

func Pextrw(a x86.M64, imm8 byte) int

Pextrw: Extract a 16-bit integer from 'a', selected with 'imm8', and store the result in the lower element of 'dst'.

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0

Instruction: 'PEXTRW'. Intrinsic: '_m_pextrw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Pinsrw ¶

func Pinsrw(a x86.M64, i int, imm8 byte) (dst x86.M64)

Pinsrw: Copy 'a' to 'dst', and insert the 16-bit integer 'i' into 'dst' at the location specified by 'imm8'.

dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]

Instruction: 'PINSRW'. Intrinsic: '_m_pinsrw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Pmaxsw ¶

func Pmaxsw(a x86.M64, b x86.M64) (dst x86.M64)

Pmaxsw: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMAXSW'. Intrinsic: '_m_pmaxsw'. Requires SSE.

func Pmaxub ¶

func Pmaxub(a x86.M64, b x86.M64) (dst x86.M64)

Pmaxub: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMAXUB'. Intrinsic: '_m_pmaxub'. Requires SSE.

func Pminsw ¶

func Pminsw(a x86.M64, b x86.M64) (dst x86.M64)

Pminsw: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMINSW'. Intrinsic: '_m_pminsw'. Requires SSE.

func Pminub ¶

func Pminub(a x86.M64, b x86.M64) (dst x86.M64)

Pminub: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMINUB'. Intrinsic: '_m_pminub'. Requires SSE.

func Pmovmskb ¶

func Pmovmskb(a x86.M64) int

Pmovmskb: Create mask from the most significant bit of each 8-bit element in 'a', and store the result in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0

Instruction: 'PMOVMSKB'. Intrinsic: '_m_pmovmskb'. Requires SSE.

func Pmulhuw ¶

func Pmulhuw(a x86.M64, b x86.M64) (dst x86.M64)

Pmulhuw: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR

Instruction: 'PMULHUW'. Intrinsic: '_m_pmulhuw'. Requires SSE.

func PowPd ¶

func PowPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

PowPd: Raise each packed double-precision (64-bit) floating-point element in 'a' to the power of the corresponding packed element in 'b', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_pow_pd'. Requires SSE.

func PowPs ¶

func PowPs(a x86.M128, b x86.M128) (dst x86.M128)

PowPs: Raise each packed single-precision (32-bit) floating-point element in 'a' to the power of the corresponding packed element in 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_pow_ps'. Requires SSE.

func Prefetch ¶

func Prefetch(p *byte, i int)

Prefetch: Fetch the line of data from memory that contains address 'p' to a location in the cache hierarchy specified by the locality hint 'i'.

Instruction: 'PREFETCHNTA, PREFETCHT0, PREFETCHT1, PREFETCHT2'. Intrinsic: '_mm_prefetch'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Psadbw ¶

func Psadbw(a x86.M64, b x86.M64) (dst x86.M64)

Psadbw: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, store it in the low 16 bits of 'dst', and zero the upper 48 bits of 'dst'.

FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0

Instruction: 'PSADBW'. Intrinsic: '_m_psadbw'. Requires SSE.

func Pshufw ¶

func Pshufw(a x86.M64, imm8 byte) (dst x86.M64)

Pshufw: Shuffle 16-bit integers in 'a' using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])

Instruction: 'PSHUFW'. Intrinsic: '_m_pshufw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func RcpPs ¶

func RcpPs(a x86.M128) (dst x86.M128)

RcpPs: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR

Instruction: 'RCPPS'. Intrinsic: '_mm_rcp_ps'. Requires SSE.

func RcpSs ¶

func RcpSs(a x86.M128) (dst x86.M128)

RcpSs: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

dst[31:0] := APPROXIMATE(1.0/a[31:0])
dst[127:32] := a[127:32]

Instruction: 'RCPSS'. Intrinsic: '_mm_rcp_ss'. Requires SSE.

func RemEpi16 ¶

func RemEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 16-bit integers in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi16'. Requires SSE.

func RemEpi32 ¶

func RemEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi32'. Requires SSE.

func RemEpi64 ¶

func RemEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 64-bit integers in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi64'. Requires SSE.

func RemEpi8 ¶

func RemEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 8-bit integers in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi8'. Requires SSE.

func RemEpu16 ¶

func RemEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 16-bit integers in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu16'. Requires SSE.

func RemEpu32 ¶

func RemEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu32'. Requires SSE.

func RemEpu64 ¶

func RemEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 64-bit integers in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu64'. Requires SSE.

func RemEpu8 ¶

func RemEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 8-bit integers in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu8'. Requires SSE.

func RsqrtPs ¶

func RsqrtPs(a x86.M128) (dst x86.M128)

RsqrtPs: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR

Instruction: 'RSQRTPS'. Intrinsic: '_mm_rsqrt_ps'. Requires SSE.

func RsqrtSs ¶

func RsqrtSs(a x86.M128) (dst x86.M128)

RsqrtSs: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0]))
dst[127:32] := a[127:32]

Instruction: 'RSQRTSS'. Intrinsic: '_mm_rsqrt_ss'. Requires SSE.

func SadPu8 ¶

func SadPu8(a x86.M64, b x86.M64) (dst x86.M64)

SadPu8: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, store it in the low 16 bits of 'dst', and zero the upper 48 bits of 'dst'.

FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0

Instruction: 'PSADBW'. Intrinsic: '_mm_sad_pu8'. Requires SSE.

func Set1Ps ¶

func Set1Ps(a float32) (dst x86.M128)

Set1Ps: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR

Instruction: '...'. Intrinsic: '_mm_set1_ps'. Requires SSE.

func SetPs ¶

func SetPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)

SetPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3

Instruction: '...'. Intrinsic: '_mm_set_ps'. Requires SSE.

func SetPs1 ¶

func SetPs1(a float32) (dst x86.M128)

SetPs1: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR

Instruction: '...'. Intrinsic: '_mm_set_ps1'. Requires SSE.

func SetSs ¶

func SetSs(a float32) (dst x86.M128)

SetSs: Copy single-precision (32-bit) floating-point element 'a' to the lower element of 'dst', and zero the upper 3 elements.

dst[31:0] := a[31:0]
dst[127:32] := 0

Instruction: '...'. Intrinsic: '_mm_set_ss'. Requires SSE.

func Setcsr ¶

func Setcsr(a uint32)

Setcsr: Set the MXCSR control and status register with the value in unsigned 32-bit integer 'a'.

MXCSR := a[31:0]

Instruction: 'LDMXCSR'. Intrinsic: '_mm_setcsr'. Requires SSE.

func SetrPs ¶

func SetrPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)

SetrPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0

Instruction: '...'. Intrinsic: '_mm_setr_ps'. Requires SSE.

func SetzeroPs ¶

func SetzeroPs() (dst x86.M128)

SetzeroPs: Return vector of type __m128 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'XORPS'. Intrinsic: '_mm_setzero_ps'. Requires SSE.

func Sfence ¶

func Sfence()

Sfence: Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.

Instruction: 'SFENCE'. Intrinsic: '_mm_sfence'. Requires SSE.

func ShufflePi16 ¶

func ShufflePi16(a x86.M64, imm8 byte) (dst x86.M64)

ShufflePi16: Shuffle 16-bit integers in 'a' using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])

Instruction: 'PSHUFW'. Intrinsic: '_mm_shuffle_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func ShufflePs ¶

func ShufflePs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

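ShufflePs: Shuffle single-precision (32-bit) floating-point elements in 'a' and 'b' using the control in 'imm8', and store the results in 'dst'. The lower 2 elements of 'dst' are selected from 'a', and the upper 2 elements are selected from 'b'.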

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])

Instruction: 'SHUFPS'. Intrinsic: '_mm_shuffle_ps'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func SinPd ¶

func SinPd(a x86.M128d) (dst x86.M128d)

SinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sin_pd'. Requires SSE.

func SinPs ¶

func SinPs(a x86.M128) (dst x86.M128)

SinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sin_ps'. Requires SSE.

func SincosPd ¶

func SincosPd(mem_addr *x86.M128d, a x86.M128d) (dst x86.M128d)

SincosPd: Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, store the sine in 'dst', and store the cosine into memory at 'mem_addr'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sincos_pd'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SincosPs ¶

func SincosPs(mem_addr *x86.M128, a x86.M128) (dst x86.M128)

SincosPs: Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, store the sine in 'dst', and store the cosine into memory at 'mem_addr'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sincos_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SindPd ¶

func SindPd(a x86.M128d) (dst x86.M128d)

SindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sind_pd'. Requires SSE.

func SindPs ¶

func SindPs(a x86.M128) (dst x86.M128)

SindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sind_ps'. Requires SSE.

func SinhPd ¶

func SinhPd(a x86.M128d) (dst x86.M128d)

SinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sinh_pd'. Requires SSE.

func SinhPs ¶

func SinhPs(a x86.M128) (dst x86.M128)

SinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sinh_ps'. Requires SSE.

func SqrtPs ¶

func SqrtPs(a x86.M128) (dst x86.M128)

SqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR

Instruction: 'SQRTPS'. Intrinsic: '_mm_sqrt_ps'. Requires SSE.

func SqrtSs ¶

func SqrtSs(a x86.M128) (dst x86.M128)

SqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := SQRT(a[31:0])
dst[127:32] := a[127:32]

Instruction: 'SQRTSS'. Intrinsic: '_mm_sqrt_ss'. Requires SSE.

func Store1Ps ¶

func Store1Ps(mem_addr *float32, a x86.M128)

Store1Ps: Store the lower single-precision (32-bit) floating-point element from 'a' into 4 contiguous elements in memory. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_store1_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorePs ¶

func StorePs(mem_addr *float32, a x86.M128)

StorePs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVAPS'. Intrinsic: '_mm_store_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorePs1 ¶

func StorePs1(mem_addr *float32, a x86.M128)

StorePs1: Store the lower single-precision (32-bit) floating-point element from 'a' into 4 contiguous elements in memory. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_store_ps1'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StoreSs ¶

func StoreSs(mem_addr *float32, a x86.M128)

StoreSs: Store the lower single-precision (32-bit) floating-point element from 'a' into memory. 'mem_addr' does not need to be aligned on any particular boundary.

MEM[mem_addr+31:mem_addr] := a[31:0]

Instruction: 'MOVSS'. Intrinsic: '_mm_store_ss'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorehPi ¶

func StorehPi(mem_addr *x86.M64, a x86.M128)

StorehPi: Store the upper 2 single-precision (32-bit) floating-point elements from 'a' into memory.

MEM[mem_addr+31:mem_addr] := a[95:64]
MEM[mem_addr+63:mem_addr+32] := a[127:96]

Instruction: 'MOVHPS'. Intrinsic: '_mm_storeh_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorelPi ¶

func StorelPi(mem_addr *x86.M64, a x86.M128)

StorelPi: Store the lower 2 single-precision (32-bit) floating-point elements from 'a' into memory.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[63:32]

Instruction: 'MOVLPS'. Intrinsic: '_mm_storel_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorerPs ¶

func StorerPs(mem_addr *float32, a x86.M128)

StorerPs: Store 4 single-precision (32-bit) floating-point elements from 'a' into memory in reverse order.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[127:96]
MEM[mem_addr+63:mem_addr+32] := a[95:64]
MEM[mem_addr+95:mem_addr+64] := a[63:32]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_storer_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StoreuPs ¶

func StoreuPs(mem_addr *float32, a x86.M128)

StoreuPs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory.

'mem_addr' does not need to be aligned on any particular boundary.

MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVUPS'. Intrinsic: '_mm_storeu_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StreamPi ¶

func StreamPi(mem_addr *x86.M64, a x86.M64)

StreamPi: Store 64-bits of integer data from 'a' into memory using a non-temporal memory hint.

MEM[mem_addr+63:mem_addr] := a[63:0]

Instruction: 'MOVNTQ'. Intrinsic: '_mm_stream_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StreamPs ¶

func StreamPs(mem_addr *float32, a x86.M128)

StreamPs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory using a non-temporal memory hint.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVNTPS'. Intrinsic: '_mm_stream_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SubPs ¶

func SubPs(a x86.M128, b x86.M128) (dst x86.M128)

SubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR

Instruction: 'SUBPS'. Intrinsic: '_mm_sub_ps'. Requires SSE.

func SubSs ¶

func SubSs(a x86.M128, b x86.M128) (dst x86.M128)

SubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]

Instruction: 'SUBSS'. Intrinsic: '_mm_sub_ss'. Requires SSE.

func SvmlCeilPd ¶

func SvmlCeilPd(a x86.M128d) (dst x86.M128d)

SvmlCeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_ceil_pd'. Requires SSE.

func SvmlCeilPs ¶

func SvmlCeilPs(a x86.M128) (dst x86.M128)

SvmlCeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_ceil_ps'. Requires SSE.

func SvmlFloorPd ¶

func SvmlFloorPd(a x86.M128d) (dst x86.M128d)

SvmlFloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_floor_pd'. Requires SSE.

func SvmlFloorPs ¶

func SvmlFloorPs(a x86.M128) (dst x86.M128)

SvmlFloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_floor_ps'. Requires SSE.

func SvmlRoundPd ¶

func SvmlRoundPd(a x86.M128d) (dst x86.M128d)

SvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_round_pd'. Requires SSE.

func SvmlRoundPs ¶

func SvmlRoundPs(a x86.M128) (dst x86.M128)

SvmlRoundPs: Round the packed single-precision (32-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_round_ps'. Requires SSE.

func SvmlSqrtPd ¶

func SvmlSqrtPd(a x86.M128d) (dst x86.M128d)

SvmlSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. Note that this intrinsic is less efficient than '_mm_sqrt_pd'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_sqrt_pd'. Requires SSE.

func SvmlSqrtPs ¶

func SvmlSqrtPs(a x86.M128) (dst x86.M128)

SvmlSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. Note that this intrinsic is less efficient than '_mm_sqrt_ps'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_sqrt_ps'. Requires SSE.

func TanPd ¶

func TanPd(a x86.M128d) (dst x86.M128d)

TanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tan_pd'. Requires SSE.

func TanPs ¶

func TanPs(a x86.M128) (dst x86.M128)

TanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tan_ps'. Requires SSE.

func TandPd ¶

func TandPd(a x86.M128d) (dst x86.M128d)

TandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tand_pd'. Requires SSE.

func TandPs ¶

func TandPs(a x86.M128) (dst x86.M128)

TandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tand_ps'. Requires SSE.

func TanhPd ¶

func TanhPd(a x86.M128d) (dst x86.M128d)

TanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tanh_pd'. Requires SSE.

func TanhPs ¶

func TanhPs(a x86.M128) (dst x86.M128)

TanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tanh_ps'. Requires SSE.

func TruncPd ¶

func TruncPd(a x86.M128d) (dst x86.M128d)

TruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_trunc_pd'. Requires SSE.

func TruncPs ¶

func TruncPs(a x86.M128) (dst x86.M128)

TruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_trunc_ps'. Requires SSE.

func UcomieqSs ¶

func UcomieqSs(a x86.M128, b x86.M128) int

UcomieqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomieq_ss'. Requires SSE.

func UcomigeSs ¶

func UcomigeSs(a x86.M128, b x86.M128) int

UcomigeSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomige_ss'. Requires SSE.

func UcomigtSs ¶

func UcomigtSs(a x86.M128, b x86.M128) int

UcomigtSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomigt_ss'. Requires SSE.

func UcomileSs ¶

func UcomileSs(a x86.M128, b x86.M128) int

UcomileSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomile_ss'. Requires SSE.

func UcomiltSs ¶

func UcomiltSs(a x86.M128, b x86.M128) int

UcomiltSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomilt_ss'. Requires SSE.

func UcomineqSs ¶

func UcomineqSs(a x86.M128, b x86.M128) int

UcomineqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomineq_ss'. Requires SSE.

func UdivEpi32 ¶

func UdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

UdivEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_udiv_epi32'. Requires SSE.

func UdivremEpi32 ¶

func UdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)

UdivremEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', store the truncated results in 'dst', and store the remainders as packed unsigned 32-bit integers into memory at 'mem_addr'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_udivrem_epi32'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func UnpackhiPs ¶

func UnpackhiPs(a x86.M128, b x86.M128) (dst x86.M128)

UnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

Instruction: 'UNPCKHPS'. Intrinsic: '_mm_unpackhi_ps'. Requires SSE.

func UnpackloPs ¶

func UnpackloPs(a x86.M128, b x86.M128) (dst x86.M128)

UnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

Instruction: 'UNPCKLPS'. Intrinsic: '_mm_unpacklo_ps'. Requires SSE.

func UremEpi32 ¶

func UremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

UremEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_urem_epi32'. Requires SSE.

func XorPs ¶

func XorPs(a x86.M128, b x86.M128) (dst x86.M128)

XorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR

Instruction: 'XORPS'. Intrinsic: '_mm_xor_ps'. Requires SSE.

Types ¶

This section is empty.

Source Files ¶

View all Source files

sse.go

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL