Documentation ¶
Overview ¶
Package cu provides an idiomatic interface to the CUDA Driver API.
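A minimal sketch of typical usage, built only from functions and types documented below. The import path and the handling of driver initialization are assumptions, and error handling is abbreviated:

package main

import (
	"fmt"
	"unsafe"

	"gorgonia.org/cu" // import path assumed
)

func main() {
	fmt.Println("CUDA driver version:", cu.Version())

	devices, err := cu.NumDevices()
	if err != nil || devices == 0 {
		return
	}

	// Create a context on device 0. Real code should also lock the OS thread;
	// see CUContext.Lock and the Ctx type below.
	dev := cu.Device(0)
	cuctx, err := dev.MakeContext(cu.SchedAuto)
	if err != nil {
		return
	}
	defer cuctx.Destroy()

	// Allocate 1 KiB of device memory, copy host data into it, and free it.
	host := make([]float32, 256)
	dptr, err := cu.MemAlloc(int64(len(host) * 4))
	if err != nil {
		return
	}
	defer cu.MemFree(dptr)
	if err := cu.MemcpyHtoD(dptr, unsafe.Pointer(&host[0]), int64(len(host)*4)); err != nil {
		return
	}
}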
Index ¶
- Constants
- Variables
- func AverageQueueLength() int
- func BlockingCallers() map[string]int
- func DestroyEvent(event *Event) (err error)
- func LauncCooperativeKernelMultiDevice()
- func LaunchCooperativeKernel()
- func Limits(limit Limit) (pvalue int64, err error)
- func MemFree(dptr DevicePtr) (err error)
- func MemFreeHost(p unsafe.Pointer) (err error)
- func MemInfo() (free int64, total int64, err error)
- func Memcpy(dst DevicePtr, src DevicePtr, ByteCount int64) (err error)
- func Memcpy2D(pCopy Memcpy2dParam) (err error)
- func Memcpy2DAsync(pCopy Memcpy2dParam, hStream Stream) (err error)
- func Memcpy2DUnaligned(pCopy Memcpy2dParam) (err error)
- func Memcpy3D(pCopy Memcpy3dParam) (err error)
- func Memcpy3DAsync(pCopy Memcpy3dParam, hStream Stream) (err error)
- func Memcpy3DPeer(pCopy Memcpy3dPeerParam) (err error)
- func Memcpy3DPeerAsync(pCopy Memcpy3dPeerParam, hStream Stream) (err error)
- func MemcpyAsync(dst DevicePtr, src DevicePtr, ByteCount int64, hStream Stream) (err error)
- func MemcpyAtoA(dstArray Array, dstOffset int64, srcArray Array, srcOffset int64, ...) (err error)
- func MemcpyAtoD(dstDevice DevicePtr, srcArray Array, srcOffset int64, ByteCount int64) (err error)
- func MemcpyAtoH(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64) (err error)
- func MemcpyAtoHAsync(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64, ...) (err error)
- func MemcpyDtoA(dstArray Array, dstOffset int64, srcDevice DevicePtr, ByteCount int64) (err error)
- func MemcpyDtoD(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64) (err error)
- func MemcpyDtoDAsync(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64, hStream Stream) (err error)
- func MemcpyDtoH(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64) (err error)
- func MemcpyDtoHAsync(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64, hStream Stream) (err error)
- func MemcpyHtoA(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64) (err error)
- func MemcpyHtoAAsync(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64, ...) (err error)
- func MemcpyHtoD(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64) (err error)
- func MemcpyHtoDAsync(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64, hStream Stream) (err error)
- func MemcpyPeer(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, ...) (err error)
- func MemcpyPeerAsync(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, ...) (err error)
- func MemsetD16(dstDevice DevicePtr, us uint16, N int64) (err error)
- func MemsetD16Async(dstDevice DevicePtr, us uint16, N int64, hStream Stream) (err error)
- func MemsetD2D16(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64) (err error)
- func MemsetD2D16Async(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64, ...) (err error)
- func MemsetD2D32(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64) (err error)
- func MemsetD2D32Async(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64, ...) (err error)
- func MemsetD2D8(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64) (err error)
- func MemsetD2D8Async(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64, ...) (err error)
- func MemsetD32(dstDevice DevicePtr, ui uint32, N int64) (err error)
- func MemsetD32Async(dstDevice DevicePtr, ui uint, N int64, hStream Stream) (err error)
- func MemsetD8(dstDevice DevicePtr, uc byte, N int64) (err error)
- func MemsetD8Async(dstDevice DevicePtr, uc byte, N int64, hStream Stream) (err error)
- func NumDevices() (count int, err error)
- func PushCurrentCtx(ctx CUContext) (err error)
- func QueueLengths() []int
- func RegisterFunc(fn HostFunction) unsafe.Pointer
- func SetCurrentCacheConfig(config FuncCacheConfig) (err error)
- func SetCurrentContext(ctx CUContext) (err error)
- func SetLimit(limit Limit, value int64) (err error)
- func SetSharedMemConfig(config SharedConfig) (err error)
- func StreamPriorityRange() (leastPriority int, greatestPriority int, err error)
- func Synchronize() (err error)
- func Version() int
- type AddressMode
- type Array
- type Array3Desc
- type ArrayDesc
- type BatchedContext
- func (ctx *BatchedContext) AllocAndCopy(p unsafe.Pointer, bytesize int64) (retVal DevicePtr, err error)
- func (ctx *BatchedContext) Cleanup()
- func (ctx *BatchedContext) Close() error
- func (ctx *BatchedContext) DoWork()
- func (ctx *BatchedContext) Errors() error
- func (ctx *BatchedContext) FirstError() error
- func (ctx *BatchedContext) IsInitialized() bool
- func (ctx *BatchedContext) LaunchAndSync(function Function, gridDimX, gridDimY, gridDimZ int, ...)
- func (ctx *BatchedContext) LaunchKernel(function Function, gridDimX, gridDimY, gridDimZ int, ...)
- func (ctx *BatchedContext) MemAlloc(bytesize int64) (retVal DevicePtr, err error)
- func (ctx *BatchedContext) MemAllocManaged(bytesize int64, flags MemAttachFlags) (retVal DevicePtr, err error)
- func (ctx *BatchedContext) MemFree(mem DevicePtr)
- func (ctx *BatchedContext) MemFreeHost(p unsafe.Pointer)
- func (ctx *BatchedContext) Memcpy(dst, src DevicePtr, byteCount int64)
- func (ctx *BatchedContext) MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, byteCount int64)
- func (ctx *BatchedContext) MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, byteCount int64)
- func (ctx *BatchedContext) Run(errChan chan error) error
- func (ctx *BatchedContext) SetCurrent()
- func (ctx *BatchedContext) Signal()
- func (ctx *BatchedContext) Synchronize()
- func (ctx *BatchedContext) WorkAvailable() <-chan struct{}
- type CUContext
- func (ctx CUContext) APIVersion() (version uint, err error)
- func (ctx *CUContext) Destroy() error
- func (peerContext CUContext) DisablePeerAccess() (err error)
- func (peerContext CUContext) EnablePeerAccess(Flags uint) (err error)
- func (ctx CUContext) Lock() error
- func (ctx *CUContext) ResetL2Cache() error
- func (ctx CUContext) String() string
- func (ctx CUContext) Unlock() error
- type Context
- type ContextFlags
- type CopyParams
- type Ctx
- func (ctx *Ctx) Address(hTexRef TexRef) (pdptr DevicePtr, err error)
- func (ctx *Ctx) AddressMode(hTexRef TexRef, dim int) (pam AddressMode, err error)
- func (ctx *Ctx) Array(hTexRef TexRef) (phArray Array, err error)
- func (ctx *Ctx) AttachMemAsync(hStream Stream, dptr DevicePtr, length int64, flags uint)
- func (ctx *Ctx) BorderColor(hTexRef TexRef) (pBorderColor [3]float32, err error)
- func (ctx *Ctx) CUDAContext() CUContext
- func (ctx *Ctx) CanAccessPeer(dev Device, peerDev Device) (canAccessPeer int, err error)
- func (ctx *Ctx) Close() error
- func (ctx *Ctx) CurrentCacheConfig() (pconfig FuncCacheConfig, err error)
- func (ctx *Ctx) CurrentDevice() (device Device, err error)
- func (ctx *Ctx) CurrentFlags() (flags ContextFlags, err error)
- func (ctx *Ctx) Descriptor(hArray Array) (pArrayDescriptor ArrayDesc, err error)
- func (ctx *Ctx) Descriptor3(hArray Array) (pArrayDescriptor Array3Desc, err error)
- func (ctx *Ctx) DestroyArray(hArray Array)
- func (ctx *Ctx) DestroyEvent(event *Event)
- func (ctx *Ctx) DestroyStream(hStream *Stream)
- func (ctx *Ctx) DisablePeerAccess(peerContext CUContext)
- func (ctx *Ctx) Do(fn func() error) error
- func (ctx *Ctx) Elapsed(hStart Event, hEnd Event) (pMilliseconds float64, err error)
- func (ctx *Ctx) EnablePeerAccess(peerContext CUContext, Flags uint)
- func (ctx *Ctx) ErrChan() chan error
- func (ctx *Ctx) Error() error
- func (ctx *Ctx) FilterMode(hTexRef TexRef) (pfm FilterMode, err error)
- func (ctx *Ctx) Format(hTexRef TexRef) (pFormat Format, pNumChannels int, err error)
- func (ctx *Ctx) FunctionAttribute(fn Function, attrib FunctionAttribute) (pi int, err error)
- func (ctx *Ctx) GetArray(hSurfRef SurfRef) (phArray Array, err error)
- func (ctx *Ctx) LaunchKernel(fn Function, gridDimX, gridDimY, gridDimZ int, ...)
- func (ctx *Ctx) Limits(limit Limit) (pvalue int64, err error)
- func (ctx *Ctx) Load(name string) (m Module, err error)
- func (ctx *Ctx) MakeEvent(flags EventFlags) (event Event, err error)
- func (ctx *Ctx) MakeStream(flags StreamFlags) (stream Stream, err error)
- func (ctx *Ctx) MakeStreamWithPriority(priority int, flags StreamFlags) (Stream, error)
- func (ctx *Ctx) MaxAnisotropy(hTexRef TexRef) (pmaxAniso int, err error)
- func (ctx *Ctx) MemAlloc(bytesize int64) (dptr DevicePtr, err error)
- func (ctx *Ctx) MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
- func (ctx *Ctx) MemAllocPitch(WidthInBytes int64, Height int64, ElementSizeBytes uint) (dptr DevicePtr, pPitch int64, err error)
- func (ctx *Ctx) MemFree(dptr DevicePtr)
- func (ctx *Ctx) MemFreeHost(p unsafe.Pointer)
- func (ctx *Ctx) MemInfo() (free int64, total int64, err error)
- func (ctx *Ctx) Memcpy(dst DevicePtr, src DevicePtr, ByteCount int64)
- func (ctx *Ctx) Memcpy2D(pCopy Memcpy2dParam)
- func (ctx *Ctx) Memcpy2DAsync(pCopy Memcpy2dParam, hStream Stream)
- func (ctx *Ctx) Memcpy2DUnaligned(pCopy Memcpy2dParam)
- func (ctx *Ctx) Memcpy3D(pCopy Memcpy3dParam)
- func (ctx *Ctx) Memcpy3DAsync(pCopy Memcpy3dParam, hStream Stream)
- func (ctx *Ctx) Memcpy3DPeer(pCopy Memcpy3dPeerParam)
- func (ctx *Ctx) Memcpy3DPeerAsync(pCopy Memcpy3dPeerParam, hStream Stream)
- func (ctx *Ctx) MemcpyAsync(dst DevicePtr, src DevicePtr, ByteCount int64, hStream Stream)
- func (ctx *Ctx) MemcpyAtoA(dstArray Array, dstOffset int64, srcArray Array, srcOffset int64, ...)
- func (ctx *Ctx) MemcpyAtoD(dstDevice DevicePtr, srcArray Array, srcOffset int64, ByteCount int64)
- func (ctx *Ctx) MemcpyAtoH(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64)
- func (ctx *Ctx) MemcpyAtoHAsync(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64, ...)
- func (ctx *Ctx) MemcpyDtoA(dstArray Array, dstOffset int64, srcDevice DevicePtr, ByteCount int64)
- func (ctx *Ctx) MemcpyDtoD(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64)
- func (ctx *Ctx) MemcpyDtoDAsync(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64, hStream Stream)
- func (ctx *Ctx) MemcpyDtoH(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64)
- func (ctx *Ctx) MemcpyDtoHAsync(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64, hStream Stream)
- func (ctx *Ctx) MemcpyHtoA(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64)
- func (ctx *Ctx) MemcpyHtoAAsync(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64, ...)
- func (ctx *Ctx) MemcpyHtoD(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64)
- func (ctx *Ctx) MemcpyHtoDAsync(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64, hStream Stream)
- func (ctx *Ctx) MemcpyPeer(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, ...)
- func (ctx *Ctx) MemcpyPeerAsync(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, ...)
- func (ctx *Ctx) MemsetD16(dstDevice DevicePtr, us uint16, N int64)
- func (ctx *Ctx) MemsetD16Async(dstDevice DevicePtr, us uint16, N int64, hStream Stream)
- func (ctx *Ctx) MemsetD2D16(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64)
- func (ctx *Ctx) MemsetD2D16Async(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64, ...)
- func (ctx *Ctx) MemsetD2D32(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64)
- func (ctx *Ctx) MemsetD2D32Async(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64, ...)
- func (ctx *Ctx) MemsetD2D8(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64)
- func (ctx *Ctx) MemsetD2D8Async(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64, ...)
- func (ctx *Ctx) MemsetD32(dstDevice DevicePtr, ui uint, N int64)
- func (ctx *Ctx) MemsetD32Async(dstDevice DevicePtr, ui uint, N int64, hStream Stream)
- func (ctx *Ctx) MemsetD8(dstDevice DevicePtr, uc byte, N int64)
- func (ctx *Ctx) MemsetD8Async(dstDevice DevicePtr, uc byte, N int64, hStream Stream)
- func (ctx *Ctx) ModuleFunction(m Module, name string) (function Function, err error)
- func (ctx *Ctx) ModuleGlobal(m Module, name string) (dptr DevicePtr, size int64, err error)
- func (ctx *Ctx) ModuleSurfRef(mod Module, name string) (SurfRef, error)
- func (ctx *Ctx) ModuleTexRef(mod Module, name string) (TexRef, error)
- func (ctx *Ctx) Priority(hStream Stream) (priority int, err error)
- func (ctx *Ctx) QueryEvent(hEvent Event)
- func (ctx *Ctx) QueryStream(hStream Stream)
- func (ctx *Ctx) Record(hEvent Event, hStream Stream)
- func (ctx *Ctx) ResetL2Cache()
- func (ctx *Ctx) Run(errChan chan error) error
- func (ctx *Ctx) SetAddress(hTexRef TexRef, dptr DevicePtr, bytes int64) (ByteOffset int64, err error)
- func (ctx *Ctx) SetAddress2D(hTexRef TexRef, desc ArrayDesc, dptr DevicePtr, Pitch int64)
- func (ctx *Ctx) SetAddressMode(hTexRef TexRef, dim int, am AddressMode)
- func (ctx *Ctx) SetBorderColor(hTexRef TexRef, pBorderColor [3]float32)
- func (ctx *Ctx) SetCacheConfig(fn Function, config FuncCacheConfig)
- func (ctx *Ctx) SetCurrentCacheConfig(config FuncCacheConfig)
- func (ctx *Ctx) SetFilterMode(hTexRef TexRef, fm FilterMode)
- func (ctx *Ctx) SetFormat(hTexRef TexRef, fmt Format, NumPackedComponents int)
- func (ctx *Ctx) SetFunctionSharedMemConfig(fn Function, config SharedConfig)
- func (ctx *Ctx) SetLimit(limit Limit, value int64)
- func (ctx *Ctx) SetMaxAnisotropy(hTexRef TexRef, maxAniso uint)
- func (ctx *Ctx) SetMipmapFilterMode(hTexRef TexRef, fm FilterMode)
- func (ctx *Ctx) SetMipmapLevelBias(hTexRef TexRef, bias float64)
- func (ctx *Ctx) SetMipmapLevelClamp(hTexRef TexRef, minMipmapLevelClamp float64, maxMipmapLevelClamp float64)
- func (ctx *Ctx) SetSharedMemConfig(config SharedConfig)
- func (ctx *Ctx) SetTexRefFlags(hTexRef TexRef, Flags TexRefFlags)
- func (ctx *Ctx) SharedMemConfig() (pConfig SharedConfig, err error)
- func (ctx *Ctx) StreamFlags(hStream Stream) (flags uint, err error)
- func (ctx *Ctx) StreamPriorityRange() (leastPriority int, greatestPriority int, err error)
- func (ctx *Ctx) SurfRefSetArray(hSurfRef SurfRef, hArray Array, Flags uint)
- func (ctx *Ctx) Synchronize()
- func (ctx *Ctx) SynchronizeEvent(hEvent Event)
- func (ctx *Ctx) SynchronizeStream(hStream Stream)
- func (ctx *Ctx) TexRefFlags(hTexRef TexRef) (pFlags uint, err error)
- func (ctx *Ctx) TexRefSetArray(hTexRef TexRef, hArray Array, Flags uint)
- func (ctx *Ctx) Unload(hmod Module)
- func (ctx *Ctx) Wait(hStream Stream, hEvent Event, Flags uint)
- func (ctx *Ctx) WaitOnValue32(stream Stream, addr DevicePtr, value uint32, flags uint)
- func (ctx *Ctx) Work() <-chan func() error
- func (ctx *Ctx) WriteValue32(stream Stream, addr DevicePtr, value uint32, flags uint)
- type Device
- func (dev Device) Attribute(attrib DeviceAttribute) (pi int, err error)
- func (dev Device) Attributes(attrs ...DeviceAttribute) ([]int, error)
- func (dev Device) CanAccessPeer(peerDev Device) (canAccessPeer int, err error)
- func (d Device) ComputeCapability() (major, minor int, err error)
- func (d Device) IsGPU() bool
- func (d Device) MakeContext(flags ContextFlags) (CUContext, error)
- func (d Device) Name() (string, error)
- func (srcDevice Device) P2PAttribute(attrib P2PAttribute, dstDevice Device) (value int, err error)
- func (dev Device) PrimaryCtxState() (flags ContextFlags, active int, err error)
- func (dev Device) ReleasePrimaryCtx() (err error)
- func (dev Device) ResetPrimaryCtx() (err error)
- func (d Device) RetainPrimaryCtx() (primaryContext CUContext, err error)
- func (dev Device) SetPrimaryCtxFlags(flags ContextFlags) (err error)
- func (d Device) String() string
- func (dev Device) TotalMem() (bytes int64, err error)
- func (d Device) UUID() (retVal uuid.UUID, err error)
- type DeviceAttribute
- type DevicePtr
- func AllocAndCopy(p unsafe.Pointer, bytesize int64) (DevicePtr, error)
- func MemAlloc(bytesize int64) (dptr DevicePtr, err error)
- func MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
- func MemAllocPitch(WidthInBytes int64, Height int64, ElementSizeBytes uint) (dptr DevicePtr, pPitch int64, err error)
- func (d DevicePtr) AddressRange() (size int64, base DevicePtr, err error)
- func (d DevicePtr) IsCUDAMemory() bool
- func (d DevicePtr) MemAdvise(count int64, advice MemAdvice, dev Device) error
- func (d DevicePtr) MemPrefetchAsync(count int64, dst Device, hStream Stream) error
- func (mem DevicePtr) MemSize() uintptr
- func (mem DevicePtr) MemoryType() (typ MemoryType, err error)
- func (d DevicePtr) PtrAttribute(attr PointerAttribute) (unsafe.Pointer, error)
- func (d DevicePtr) SetPtrAttribute(value unsafe.Pointer, attr PointerAttribute) error
- func (d DevicePtr) String() string
- func (d DevicePtr) Uintptr() uintptr
- type ErrorLister
- type Event
- type EventFlags
- type ExecGraph
- type FilterMode
- type Format
- type FuncCacheConfig
- type Function
- func (fn Function) Attribute(attrib FunctionAttribute) (pi int, err error)
- func (fn Function) Launch(gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, ...) error
- func (fn Function) LaunchAndSync(...) error
- func (fn Function) MaxActiveBlocksPerMultiProcessor(blockSize int, dynamicSmemSize int64) (int, error)
- func (fn Function) MaxActiveBlocksPerMultiProcessorWithFlags(blockSize int, dynamicSmemSize int64, flags OccupancyFlags) (int, error)
- func (fn Function) SetCacheConfig(config FuncCacheConfig) (err error)
- func (fn Function) SetSharedMemConfig(config SharedConfig) (err error)
- type FunctionAttribute
- type Graph
- func (g Graph) AddDependencies(from, to []Node) error
- func (g Graph) AddEmptyNode(children []Node) (Node, error)
- func (g Graph) AddHostNode(children []Node, params *HostNodeParams) (Node, error)
- func (g Graph) AddKernelNode(children []Node, params *KernelNodeParams) (Node, error)
- func (g Graph) AddMemcpyNode(children []Node, params *CopyParams, ctx Context) (Node, error)
- func (g Graph) AddMemsetNode(children []Node, params *MemsetParams, ctx Context) (Node, error)
- func (g Graph) Clone() (Graph, error)
- func (g Graph) Destroy() error
- func (g Graph) Edges(from, to []Node) (edges []int, numEdges int, err error)
- func (g Graph) String() string
- type HostFunction
- type HostNodeParams
- type JITCacheMode
- type JITCacheModeOption
- type JITErrorLogBuffer
- type JITFallbackOption
- type JITFallbackStrategy
- type JITGenerateDebugInfo
- type JITGenerateLineInfo
- type JITInfoLogBuffer
- type JITInputType
- type JITLogVerbose
- type JITMaxRegisters
- type JITOptimizationLevel
- type JITOption
- type JITTarget
- type JITTargetFromContext
- type JITTargetOption
- type JITThreadsPerBlock
- type JITWallTime
- type KernelNodeParams
- type Limit
- type LinkState
- type MemAdvice
- type MemAttachFlags
- type Memcpy2dParam
- type Memcpy3dParam
- type Memcpy3dPeerParam
- type MemoryType
- type MemsetParams
- type Module
- type Node
- type OccupancyFlags
- type P2PAttribute
- type PointerAttribute
- type SharedConfig
- type Stream
- func (hStream Stream) AttachMemAsync(dptr DevicePtr, length int64, flags uint) (err error)
- func (s Stream) C() C.CUstream
- func (hStream *Stream) Destroy() error
- func (hStream Stream) Flags() (flags StreamFlags, err error)
- func (hStream Stream) Priority() (priority int, err error)
- func (hStream Stream) Query() (err error)
- func (hStream Stream) Synchronize() (err error)
- func (hStream Stream) Wait(hEvent Event, Flags uint) (err error)
- func (stream Stream) WaitOnValue32(addr DevicePtr, value uint32, flags uint) (err error)
- func (stream Stream) WriteValue32(addr DevicePtr, value uint32, flags uint) (err error)
- type StreamFlags
- type SurfRef
- type TexRef
- func (hTexRef TexRef) Address() (pdptr DevicePtr, err error)
- func (hTexRef TexRef) AddressMode(dim int) (pam AddressMode, err error)
- func (hTexRef TexRef) Array() (phArray Array, err error)
- func (hTexRef TexRef) BorderColor() (pBorderColor [3]float32, err error)
- func (hTexRef TexRef) FilterMode() (pfm FilterMode, err error)
- func (hTexRef TexRef) Flags() (pFlags TexRefFlags, err error)
- func (hTexRef TexRef) Format() (pFormat Format, pNumChannels int, err error)
- func (hTexRef TexRef) MaxAnisotropy() (pmaxAniso int, err error)
- func (hTexRef TexRef) SetAddress(dptr DevicePtr, bytes int64) (ByteOffset int64, err error)
- func (hTexRef TexRef) SetAddress2D(desc ArrayDesc, dptr DevicePtr, Pitch int64) (err error)
- func (hTexRef TexRef) SetAddressMode(dim int, am AddressMode) (err error)
- func (hTexRef TexRef) SetArray(hArray Array, Flags uint) (err error)
- func (hTexRef TexRef) SetBorderColor(pBorderColor [3]float32) (err error)
- func (hTexRef TexRef) SetFilterMode(fm FilterMode) (err error)
- func (hTexRef TexRef) SetFlags(Flags TexRefFlags) (err error)
- func (hTexRef TexRef) SetFormat(fmt Format, NumPackedComponents int) (err error)
- func (hTexRef TexRef) SetMaxAnisotropy(maxAniso uint) (err error)
- func (hTexRef TexRef) SetMipmapFilterMode(fm FilterMode) (err error)
- func (hTexRef TexRef) SetMipmapLevelBias(bias float64) (err error)
- func (hTexRef TexRef) SetMipmapLevelClamp(minMipmapLevelClamp float64, maxMipmapLevelClamp float64) (err error)
- type TexRefFlags
Constants ¶
const (
	Success cuResult = C.CUDA_SUCCESS
	InvalidValue cuResult = C.CUDA_ERROR_INVALID_VALUE
	OutOfMemory cuResult = C.CUDA_ERROR_OUT_OF_MEMORY
	NotInitialized cuResult = C.CUDA_ERROR_NOT_INITIALIZED
	Deinitialized cuResult = C.CUDA_ERROR_DEINITIALIZED
	ProfilerDisabled cuResult = C.CUDA_ERROR_PROFILER_DISABLED
	ProfilerNotInitialized cuResult = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
	ProfilerAlreadyStarted cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
	ProfilerAlreadyStopped cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
	NoDevice cuResult = C.CUDA_ERROR_NO_DEVICE
	InvalidDevice cuResult = C.CUDA_ERROR_INVALID_DEVICE
	InvalidImage cuResult = C.CUDA_ERROR_INVALID_IMAGE
	InvalidContext cuResult = C.CUDA_ERROR_INVALID_CONTEXT
	ContextAlreadyCurrent cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
	MapFailed cuResult = C.CUDA_ERROR_MAP_FAILED
	UnmapFailed cuResult = C.CUDA_ERROR_UNMAP_FAILED
	ArrayIsMapped cuResult = C.CUDA_ERROR_ARRAY_IS_MAPPED
	AlreadyMapped cuResult = C.CUDA_ERROR_ALREADY_MAPPED
	NoBinaryForGpu cuResult = C.CUDA_ERROR_NO_BINARY_FOR_GPU
	AlreadyAcquired cuResult = C.CUDA_ERROR_ALREADY_ACQUIRED
	NotMapped cuResult = C.CUDA_ERROR_NOT_MAPPED
	NotMappedAsArray cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
	NotMappedAsPointer cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
	EccUncorrectable cuResult = C.CUDA_ERROR_ECC_UNCORRECTABLE
	UnsupportedLimit cuResult = C.CUDA_ERROR_UNSUPPORTED_LIMIT
	ContextAlreadyInUse cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
	PeerAccessUnsupported cuResult = C.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED
	InvalidPtx cuResult = C.CUDA_ERROR_INVALID_PTX
	InvalidGraphicsContext cuResult = C.CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
	NvlinkUncorrectable cuResult = C.CUDA_ERROR_NVLINK_UNCORRECTABLE
	JitCompilerNotFound cuResult = C.CUDA_ERROR_JIT_COMPILER_NOT_FOUND
	InvalidSource cuResult = C.CUDA_ERROR_INVALID_SOURCE
	FileNotFound cuResult = C.CUDA_ERROR_FILE_NOT_FOUND
	OperatingSystem cuResult = C.CUDA_ERROR_OPERATING_SYSTEM
	InvalidHandle cuResult = C.CUDA_ERROR_INVALID_HANDLE
	IllegalState cuResult = C.CUDA_ERROR_ILLEGAL_STATE
	NotFound cuResult = C.CUDA_ERROR_NOT_FOUND
	NotReady cuResult = C.CUDA_ERROR_NOT_READY
	IllegalAddress cuResult = C.CUDA_ERROR_ILLEGAL_ADDRESS
	LaunchOutOfResources cuResult = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
	LaunchTimeout cuResult = C.CUDA_ERROR_LAUNCH_TIMEOUT
	LaunchIncompatibleTexturing cuResult = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
	PeerAccessAlreadyEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
	PeerAccessNotEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
	PrimaryContextActive cuResult = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
	ContextIsDestroyed cuResult = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
	Assert cuResult = C.CUDA_ERROR_ASSERT
	TooManyPeers cuResult = C.CUDA_ERROR_TOO_MANY_PEERS
	HostMemoryAlreadyRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
	HostMemoryNotRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
	HardwareStackError cuResult = C.CUDA_ERROR_HARDWARE_STACK_ERROR
	IllegalInstruction cuResult = C.CUDA_ERROR_ILLEGAL_INSTRUCTION
	MisalignedAddress cuResult = C.CUDA_ERROR_MISALIGNED_ADDRESS
	InvalidAddressSpace cuResult = C.CUDA_ERROR_INVALID_ADDRESS_SPACE
	InvalidPc cuResult = C.CUDA_ERROR_INVALID_PC
	LaunchFailed cuResult = C.CUDA_ERROR_LAUNCH_FAILED
	CooperativeLaunchTooLarge cuResult = C.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE
	NotPermitted cuResult = C.CUDA_ERROR_NOT_PERMITTED
	NotSupported cuResult = C.CUDA_ERROR_NOT_SUPPORTED
	SystemNotReady cuResult = C.CUDA_ERROR_SYSTEM_NOT_READY
	SystemDriverMismatch cuResult = C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH
	CompatNotSupportedOnDevice cuResult = C.CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
	StreamCaptureUnsupported cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
	StreamCaptureInvalidated cuResult = C.CUDA_ERROR_STREAM_CAPTURE_INVALIDATED
	StreamCaptureMerge cuResult = C.CUDA_ERROR_STREAM_CAPTURE_MERGE
	StreamCaptureUnmatched cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNMATCHED
	StreamCaptureUnjoined cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNJOINED
	StreamCaptureIsolation cuResult = C.CUDA_ERROR_STREAM_CAPTURE_ISOLATION
	StreamCaptureImplicit cuResult = C.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
	CapturedEvent cuResult = C.CUDA_ERROR_CAPTURED_EVENT
	StreamCaptureWrongThread cuResult = C.CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
	Timeout cuResult = C.CUDA_ERROR_TIMEOUT
	GraphExecUpdateFailure cuResult = C.CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE
	Unknown cuResult = C.CUDA_ERROR_UNKNOWN
)
const DEBUG = false
Variables ¶
var NoStream = Stream{}
Functions ¶
func AverageQueueLength ¶
func AverageQueueLength() int
AverageQueueLength returns the average queue length recorded. This is useful for tuning and optimizing the batching behaviour.
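For instance, a small diagnostic helper (a sketch; dumpBatchStats is a hypothetical name) could report the instrumentation exposed by AverageQueueLength, QueueLengths and BlockingCallers:

func dumpBatchStats() {
	fmt.Println("average queue length:", cu.AverageQueueLength())
	fmt.Println("recorded queue lengths:", cu.QueueLengths())
	for caller, n := range cu.BlockingCallers() {
		fmt.Printf("blocking caller %s: %d calls\n", caller, n)
	}
}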
func BlockingCallers ¶
func DestroyEvent ¶
func LauncCooperativeKernelMultiDevice ¶ added in v0.9.4
func LauncCooperativeKernelMultiDevice()
STUB
func MemFreeHost ¶
func Memcpy2D ¶
func Memcpy2D(pCopy Memcpy2dParam) (err error)
func Memcpy2DAsync ¶
func Memcpy2DAsync(pCopy Memcpy2dParam, hStream Stream) (err error)
func Memcpy2DUnaligned ¶
func Memcpy2DUnaligned(pCopy Memcpy2dParam) (err error)
func Memcpy3D ¶
func Memcpy3D(pCopy Memcpy3dParam) (err error)
func Memcpy3DAsync ¶
func Memcpy3DAsync(pCopy Memcpy3dParam, hStream Stream) (err error)
func Memcpy3DPeer ¶
func Memcpy3DPeer(pCopy Memcpy3dPeerParam) (err error)
func Memcpy3DPeerAsync ¶
func Memcpy3DPeerAsync(pCopy Memcpy3dPeerParam, hStream Stream) (err error)
func MemcpyAsync ¶
func MemcpyAtoA ¶
func MemcpyAtoD ¶
func MemcpyAtoH ¶
func MemcpyAtoHAsync ¶
func MemcpyDtoA ¶
func MemcpyDtoD ¶
func MemcpyDtoDAsync ¶
func MemcpyDtoH ¶
func MemcpyDtoHAsync ¶
func MemcpyHtoA ¶
func MemcpyHtoAAsync ¶
func MemcpyHtoD ¶
func MemcpyHtoDAsync ¶
func MemcpyPeer ¶
func MemcpyPeerAsync ¶
func MemsetD16Async ¶
func MemsetD2D16 ¶
func MemsetD2D16Async ¶
func MemsetD2D32 ¶
func MemsetD2D32Async ¶
func MemsetD2D8 ¶
func MemsetD2D8Async ¶
func MemsetD32Async ¶
func MemsetD8Async ¶
func NumDevices ¶
func PushCurrentCtx ¶
func RegisterFunc ¶ added in v0.9.4
func RegisterFunc(fn HostFunction) unsafe.Pointer
RegisterFunc is used to register a Go-based callback such that it may be called by CUDA.
func SetCurrentCacheConfig ¶
func SetCurrentCacheConfig(config FuncCacheConfig) (err error)
func SetCurrentContext ¶
func SetSharedMemConfig ¶
func SetSharedMemConfig(config SharedConfig) (err error)
func StreamPriorityRange ¶
func Synchronize ¶
func Synchronize() (err error)
Types ¶
type AddressMode ¶
type AddressMode byte
AddressMode specifies the texture reference addressing modes.
const (
	WrapMode AddressMode = C.CU_TR_ADDRESS_MODE_WRAP // Wrapping address mode
	ClampMode AddressMode = C.CU_TR_ADDRESS_MODE_CLAMP // Clamp to edge address mode
	MirrorMode AddressMode = C.CU_TR_ADDRESS_MODE_MIRROR // Mirror address mode
	BorderMode AddressMode = C.CU_TR_ADDRESS_MODE_BORDER // Border address mode
)
type Array ¶
type Array struct {
// contains filtered or unexported fields
}
Array is the pointer to a CUDA array. The name is a bit of a misnomer, as it may lead one to think that the array is rangeable. It's not.
func Make3DArray ¶
func Make3DArray(pAllocateArray Array3Desc) (pHandle Array, err error)
func (Array) Descriptor ¶
func (Array) Descriptor3 ¶
func (hArray Array) Descriptor3() (pArrayDescriptor Array3Desc, err error)
type Array3Desc ¶
Array3Desc is the descriptor for CUDA 3D arrays, which is used to determine what to allocate.
From the docs:
Width, Height, and Depth are the width, height, and depth of the CUDA array (in elements). The following types of CUDA arrays can be allocated:

- A 1D array is allocated if Height and Depth extents are both zero.
- A 2D array is allocated if only Depth extent is zero.
- A 3D array is allocated if all three extents are non-zero.
- A 1D layered CUDA array is allocated if only Height is zero and the CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number of layers is determined by the depth extent.
- A 2D layered CUDA array is allocated if all three extents are non-zero and the CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number of layers is determined by the depth extent.
- A cubemap CUDA array is allocated if all three extents are non-zero and the CUDA_ARRAY3D_CUBEMAP flag is set. Width must be equal to Height, and Depth must be six. A cubemap is a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. The order of the six layers in memory is the same as that listed in CUarray_cubemap_face.
- A cubemap layered CUDA array is allocated if all three extents are non-zero, and both the CUDA_ARRAY3D_CUBEMAP and CUDA_ARRAY3D_LAYERED flags are set. Width must be equal to Height, and Depth must be a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form the second cubemap, and so on.
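As an illustrative sketch only, allocating a plain 3D array (all three extents non-zero) might look like the following. The Array3Desc field names and the Float32 format constant are assumptions; check the Array3Desc and Format definitions in this package before relying on them.

desc := cu.Array3Desc{
	Width:       64, // field names assumed; see the Array3Desc definition
	Height:      64,
	Depth:       16,
	Format:      cu.Float32, // format constant name assumed
	NumChannels: 1,
}
arr, err := cu.Make3DArray(desc)
if err != nil {
	// handle error
}
defer ctx.DestroyArray(arr) // assumes a *Ctx named ctx is in scope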
type ArrayDesc ¶
ArrayDesc is the descriptor for CUDA arrays, which is used to determine what to allocate.
From the docs:
Width and Height are the width and height of the CUDA array (in elements); the CUDA array is one-dimensional if Height is 0, and two-dimensional otherwise.
type BatchedContext ¶
BatchedContext is a CUDA context where the CUDA calls are batched up.
Typically a locked OS thread is made to execute the CUDA calls like so:
func main() {
	ctx := NewBatchedContext(...)
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	workAvailable := ctx.WorkAvailable()
	go doWhatever(ctx)
	for {
		select {
		case <-workAvailable:
			ctx.DoWork()
			err := ctx.Errors()
			handleErrors(err)
		case ...:
		}
	}
}

func doWhatever(ctx *BatchedContext) {
	ctx.Memcpy(...)
	// et cetera
	// et cetera
}
For the moment, BatchedContext only supports a limited number of CUDA Runtime APIs. Feel free to send a pull request with more APIs.
func NewBatchedContext ¶
func NewBatchedContext(c Context, d Device) *BatchedContext
NewBatchedContext creates a batched CUDA context.
func (*BatchedContext) AllocAndCopy ¶
func (*BatchedContext) Cleanup ¶
func (ctx *BatchedContext) Cleanup()
Cleanup is the cleanup function. It cleans up all the ancillary allocations that have happened for all the batched calls. This method should be called when the context is done with; otherwise a lot of memory will be leaked.
The main reason this method exists is that there is no way to reliably free the memory without causing weird issues in the CUDA calls.
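A sketch of the teardown this implies (the ordering of Cleanup relative to Close is an assumption):

bctx := cu.NewBatchedContext(ctx, dev) // ctx is a cu.Context, dev a cu.Device
// ... issue batched calls ...
bctx.Cleanup() // free the ancillary allocations made for the batched calls
bctx.Close()   // then release the context's own resources (ordering assumed)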
func (*BatchedContext) Close ¶ added in v0.9.1
func (ctx *BatchedContext) Close() error
Close closes the batched context
func (*BatchedContext) DoWork ¶
func (ctx *BatchedContext) DoWork()
DoWork waits for work to come in from the queue. If the incoming call is a blocking one, the entire queue is processed immediately; otherwise the call is added to the batch queue.
func (*BatchedContext) Errors ¶
func (ctx *BatchedContext) Errors() error
Errors returns any errors that may have occurred during batch processing.
func (*BatchedContext) FirstError ¶
func (ctx *BatchedContext) FirstError() error
FirstError returns the first error, if there was any.
func (*BatchedContext) IsInitialized ¶ added in v0.9.1
func (ctx *BatchedContext) IsInitialized() bool
func (*BatchedContext) LaunchAndSync ¶
func (*BatchedContext) LaunchKernel ¶
func (*BatchedContext) MemAlloc ¶
func (ctx *BatchedContext) MemAlloc(bytesize int64) (retVal DevicePtr, err error)
MemAlloc allocates memory. It is a blocking call.
func (*BatchedContext) MemAllocManaged ¶
func (ctx *BatchedContext) MemAllocManaged(bytesize int64, flags MemAttachFlags) (retVal DevicePtr, err error)
func (*BatchedContext) MemFree ¶
func (ctx *BatchedContext) MemFree(mem DevicePtr)
func (*BatchedContext) MemFreeHost ¶
func (ctx *BatchedContext) MemFreeHost(p unsafe.Pointer)
func (*BatchedContext) Memcpy ¶
func (ctx *BatchedContext) Memcpy(dst, src DevicePtr, byteCount int64)
func (*BatchedContext) MemcpyDtoH ¶
func (ctx *BatchedContext) MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, byteCount int64)
func (*BatchedContext) MemcpyHtoD ¶
func (ctx *BatchedContext) MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, byteCount int64)
func (*BatchedContext) Run ¶
func (ctx *BatchedContext) Run(errChan chan error) error
Run manages the running of the BatchedContext. Because it is expected to run in a goroutine, an error channel must be passed in.
func (*BatchedContext) SetCurrent ¶
func (ctx *BatchedContext) SetCurrent()
SetCurrent sets the current context. This is usually unnecessary because SetCurrent will be called before batch processing the calls.
func (*BatchedContext) Signal ¶ added in v0.9.4
func (ctx *BatchedContext) Signal()
Signal is used to tell the context that work is available.
func (*BatchedContext) Synchronize ¶
func (ctx *BatchedContext) Synchronize()
func (*BatchedContext) WorkAvailable ¶
func (ctx *BatchedContext) WorkAvailable() <-chan struct{}
WorkAvailable returns the channel on which work availability is broadcast.
type CUContext ¶
type CUContext struct {
// contains filtered or unexported fields
}
CUContext is a CUDA context
func CurrentContext ¶
func PopCurrentCtx ¶
func (CUContext) APIVersion ¶
func (*CUContext) Destroy ¶ added in v0.9.1
Destroy destroys the context. It returns an error if the context wasn't properly destroyed.
Wrapper over cuCtxDestroy: http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
func (CUContext) DisablePeerAccess ¶
func (CUContext) EnablePeerAccess ¶
func (CUContext) Lock ¶
Lock ties the calling goroutine to an OS thread, then ties the CUDA context to the thread. Do not call in a goroutine.
Good:
func main() {
	dev, _ := GetDevice(0)
	ctx, _ := dev.MakeContext()
	if err := ctx.Lock(); err != nil {
		// handle error
	}
	mem, _ := MemAlloc(1024)
}
Bad:
func main() {
	dev, _ := GetDevice(0)
	ctx, _ := dev.MakeContext()
	go ctx.Lock() // this will tie the goroutine that calls ctx.Lock to the OS thread, while the main thread does not get the lock
	mem, _ := MemAlloc(1024)
}
func (*CUContext) ResetL2Cache ¶ added in v0.9.4
ResetL2Cache resets all persisting lines in the L2 cache to normal status. Use it only if your device supports it.
type Context ¶
type Context interface {
	// Operational stuff
	CUDAContext() CUContext
	Error() error
	Run(chan error) error
	Do(fn func() error) error
	Work() <-chan func() error
	ErrChan() chan error
	Close() error // Close closes all resources associated with the context

	// actual methods
	Address(hTexRef TexRef) (pdptr DevicePtr, err error)
	AddressMode(hTexRef TexRef, dim int) (pam AddressMode, err error)
	Array(hTexRef TexRef) (phArray Array, err error)
	AttachMemAsync(hStream Stream, dptr DevicePtr, length int64, flags uint)
	BorderColor(hTexRef TexRef) (pBorderColor [3]float32, err error)
	CurrentCacheConfig() (pconfig FuncCacheConfig, err error)
	CurrentDevice() (device Device, err error)
	CurrentFlags() (flags ContextFlags, err error)
	Descriptor(hArray Array) (pArrayDescriptor ArrayDesc, err error)
	Descriptor3(hArray Array) (pArrayDescriptor Array3Desc, err error)
	DestroyArray(hArray Array)
	DestroyEvent(event *Event)
	DestroyStream(hStream *Stream)
	DisablePeerAccess(peerContext CUContext)
	Elapsed(hStart Event, hEnd Event) (pMilliseconds float64, err error)
	EnablePeerAccess(peerContext CUContext, Flags uint)
	FilterMode(hTexRef TexRef) (pfm FilterMode, err error)
	Format(hTexRef TexRef) (pFormat Format, pNumChannels int, err error)
	FunctionAttribute(fn Function, attrib FunctionAttribute) (pi int, err error)
	GetArray(hSurfRef SurfRef) (phArray Array, err error)
	LaunchKernel(fn Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)
	Limits(limit Limit) (pvalue int64, err error)
	Load(name string) (m Module, err error)
	MakeEvent(flags EventFlags) (event Event, err error)
	MakeStream(flags StreamFlags) (stream Stream, err error)
	MakeStreamWithPriority(priority int, flags StreamFlags) (stream Stream, err error)
	MaxAnisotropy(hTexRef TexRef) (pmaxAniso int, err error)
	MemAlloc(bytesize int64) (dptr DevicePtr, err error)
	MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
	MemAllocPitch(WidthInBytes int64, Height int64, ElementSizeBytes uint) (dptr DevicePtr, pPitch int64, err error)
	MemFree(dptr DevicePtr)
	MemFreeHost(p unsafe.Pointer)
	MemInfo() (free int64, total int64, err error)
	Memcpy(dst DevicePtr, src DevicePtr, ByteCount int64)
	Memcpy2D(pCopy Memcpy2dParam)
	Memcpy2DAsync(pCopy Memcpy2dParam, hStream Stream)
	Memcpy2DUnaligned(pCopy Memcpy2dParam)
	Memcpy3D(pCopy Memcpy3dParam)
	Memcpy3DAsync(pCopy Memcpy3dParam, hStream Stream)
	Memcpy3DPeer(pCopy Memcpy3dPeerParam)
	Memcpy3DPeerAsync(pCopy Memcpy3dPeerParam, hStream Stream)
	MemcpyAsync(dst DevicePtr, src DevicePtr, ByteCount int64, hStream Stream)
	MemcpyAtoA(dstArray Array, dstOffset int64, srcArray Array, srcOffset int64, ByteCount int64)
	MemcpyAtoD(dstDevice DevicePtr, srcArray Array, srcOffset int64, ByteCount int64)
	MemcpyAtoH(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64)
	MemcpyAtoHAsync(dstHost unsafe.Pointer, srcArray Array, srcOffset int64, ByteCount int64, hStream Stream)
	MemcpyDtoA(dstArray Array, dstOffset int64, srcDevice DevicePtr, ByteCount int64)
	MemcpyDtoD(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64)
	MemcpyDtoDAsync(dstDevice DevicePtr, srcDevice DevicePtr, ByteCount int64, hStream Stream)
	MemcpyDtoH(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64)
	MemcpyDtoHAsync(dstHost unsafe.Pointer, srcDevice DevicePtr, ByteCount int64, hStream Stream)
	MemcpyHtoA(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64)
	MemcpyHtoAAsync(dstArray Array, dstOffset int64, srcHost unsafe.Pointer, ByteCount int64, hStream Stream)
	MemcpyHtoD(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64)
	MemcpyHtoDAsync(dstDevice DevicePtr, srcHost unsafe.Pointer, ByteCount int64, hStream Stream)
	MemcpyPeer(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, srcContext CUContext, ByteCount int64)
	MemcpyPeerAsync(dstDevice DevicePtr, dstContext CUContext, srcDevice DevicePtr, srcContext CUContext, ByteCount int64, hStream Stream)
	MemsetD16(dstDevice DevicePtr, us uint16, N int64)
	MemsetD16Async(dstDevice DevicePtr, us uint16, N int64, hStream Stream)
	MemsetD2D16(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64)
	MemsetD2D16Async(dstDevice DevicePtr, dstPitch int64, us uint16, Width int64, Height int64, hStream Stream)
	MemsetD2D32(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64)
	MemsetD2D32Async(dstDevice DevicePtr, dstPitch int64, ui uint, Width int64, Height int64, hStream Stream)
	MemsetD2D8(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64)
	MemsetD2D8Async(dstDevice DevicePtr, dstPitch int64, uc byte, Width int64, Height int64, hStream Stream)
	MemsetD32(dstDevice DevicePtr, ui uint, N int64)
	MemsetD32Async(dstDevice DevicePtr, ui uint, N int64, hStream Stream)
	MemsetD8(dstDevice DevicePtr, uc byte, N int64)
	MemsetD8Async(dstDevice DevicePtr, uc byte, N int64, hStream Stream)
	ModuleFunction(m Module, name string) (function Function, err error)
	ModuleGlobal(m Module, name string) (dptr DevicePtr, size int64, err error)
	Priority(hStream Stream) (priority int, err error)
	QueryEvent(hEvent Event)
	QueryStream(hStream Stream)
	Record(hEvent Event, hStream Stream)
	SetAddress(hTexRef TexRef, dptr DevicePtr, bytes int64) (ByteOffset int64, err error)
	SetAddress2D(hTexRef TexRef, desc ArrayDesc, dptr DevicePtr, Pitch int64)
	SetAddressMode(hTexRef TexRef, dim int, am AddressMode)
	SetBorderColor(hTexRef TexRef, pBorderColor [3]float32)
	SetCacheConfig(fn Function, config FuncCacheConfig)
	SetCurrentCacheConfig(config FuncCacheConfig)
	SetFilterMode(hTexRef TexRef, fm FilterMode)
	SetFormat(hTexRef TexRef, fmt Format, NumPackedComponents int)
	SetLimit(limit Limit, value int64)
	SetMaxAnisotropy(hTexRef TexRef, maxAniso uint)
	SetMipmapFilterMode(hTexRef TexRef, fm FilterMode)
	SetMipmapLevelBias(hTexRef TexRef, bias float64)
	SetMipmapLevelClamp(hTexRef TexRef, minMipmapLevelClamp float64, maxMipmapLevelClamp float64)
	SetTexRefFlags(hTexRef TexRef, Flags TexRefFlags)
	StreamFlags(hStream Stream) (flags uint, err error)
	StreamPriorityRange() (leastPriority int, greatestPriority int, err error)
	SurfRefSetArray(hSurfRef SurfRef, hArray Array, Flags uint)
	Synchronize()
	SynchronizeEvent(hEvent Event)
	SynchronizeStream(hStream Stream)
	TexRefFlags(hTexRef TexRef) (pFlags uint, err error)
	TexRefSetArray(hTexRef TexRef, hArray Array, Flags uint)
	Unload(hmod Module)
	Wait(hStream Stream, hEvent Event, Flags uint)
	WaitOnValue32(stream Stream, addr DevicePtr, value uint32, flags uint)
	WriteValue32(stream Stream, addr DevicePtr, value uint32, flags uint)
}
Context is the interface for a CUDA context. Typically you'd just embed *Ctx; rarely do you need to use CUContext directly.
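As the note above suggests, a type usually satisfies Context simply by embedding a *Ctx. A minimal sketch (the Engine type and its constructor are hypothetical):

type Engine struct {
	*cu.Ctx // embedding *Ctx promotes the whole Context method set
}

func newEngine(dev cu.Device) *Engine {
	return &Engine{Ctx: cu.NewContext(dev, cu.SchedAuto)}
}

// compile-time assertion (assuming *Ctx implements Context, as its method set suggests)
var _ cu.Context = &Engine{}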
type ContextFlags ¶
type ContextFlags byte
ContextFlags are flags that are used to create a context
const (
	SchedAuto ContextFlags = C.CU_CTX_SCHED_AUTO // Automatic scheduling
	SchedSpin ContextFlags = C.CU_CTX_SCHED_SPIN // Set spin as default scheduling
	SchedYield ContextFlags = C.CU_CTX_SCHED_YIELD // Set yield as default scheduling
	SchedBlockingSync ContextFlags = C.CU_CTX_SCHED_BLOCKING_SYNC // Set blocking synchronization as default scheduling
	SchedMask ContextFlags = C.CU_CTX_SCHED_MASK // Mask for setting scheduling options for the flag
	MapHost ContextFlags = C.CU_CTX_MAP_HOST // Support mapped pinned allocations
	LMemResizeToMax ContextFlags = C.CU_CTX_LMEM_RESIZE_TO_MAX // Keep local memory allocation after launch
	FlagsMas ContextFlags = C.CU_CTX_FLAGS_MASK // Mask for setting other options to flags
)
func CurrentFlags ¶
func CurrentFlags() (flags ContextFlags, err error)
type CopyParams ¶ added in v0.9.4
type CopyParams struct {
	SrcXInBytes uint64
	SrcY uint64
	SrcZ uint64
	SrcLOD uint64
	SrcType MemoryType
	SrcHost unsafe.Pointer
	SrcDevicePtr DevicePtr
	SrcArray Array
	Reserved0 unsafe.Pointer
	SrcPitch uint64
	SrcHeight uint64
	DstXInBytes uint64
	DstY uint64
	DstZ uint64
	DstLOD uint64
	DstType MemoryType
	DstHost unsafe.Pointer
	DstDevicePtr DevicePtr
	DstArray Array
	Reserved1 unsafe.Pointer
	DstPitch uint64
	DstHeight uint64
	WidthInBytes uint64
	Height uint64
	Depth uint64
}
type Ctx ¶
type Ctx struct {
	CUContext
	// contains filtered or unexported fields
}
Ctx is a standalone CUDA Context that is thread-locked.
func CtxFromCUContext ¶
func CtxFromCUContext(d Device, cuctx CUContext, flags ContextFlags) *Ctx
CtxFromCUContext is another way of building a *Ctx.
Typical example:
cuctx, err := dev.MakeContext(SchedAuto)
if err != nil {
	// .. error handling ..
}
ctx := CtxFromCUContext(d, cuctx)
func NewContext ¶
func NewContext(d Device, flags ContextFlags) *Ctx
NewContext creates a new context and runs a listener locked to an OS thread. All work is piped through that goroutine.
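A sketch of typical use (the assumption here is that errors from the methods without an error return are collected on the Ctx and retrieved via Error):

func work() error {
	ctx := cu.NewContext(cu.Device(0), cu.SchedAuto)
	defer ctx.Close()

	dptr, err := ctx.MemAlloc(1024)
	if err != nil {
		return err
	}
	ctx.MemsetD8(dptr, 0, 1024) // no error return; errors accumulate on ctx (assumed)
	ctx.MemFree(dptr)
	return ctx.Error()
}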
func NewManuallyManagedContext ¶
func NewManuallyManagedContext(d Device, flags ContextFlags) *Ctx
NewManuallyManagedContext creates a new context, but the Run() method, which locks a goroutine to an OS thread, has to be called manually.
func (*Ctx) AddressMode ¶
func (ctx *Ctx) AddressMode(hTexRef TexRef, dim int) (pam AddressMode, err error)
func (*Ctx) AttachMemAsync ¶
func (*Ctx) BorderColor ¶
func (*Ctx) CUDAContext ¶
CUDAContext returns the CUDA Context
func (*Ctx) CanAccessPeer ¶
func (*Ctx) Close ¶ added in v0.9.1
Close destroys the CUDA context and the associated resources that have been created. Additionally, all channels of communication will be closed.
func (*Ctx) CurrentCacheConfig ¶
func (ctx *Ctx) CurrentCacheConfig() (pconfig FuncCacheConfig, err error)
func (*Ctx) CurrentDevice ¶
func (*Ctx) CurrentFlags ¶
func (ctx *Ctx) CurrentFlags() (flags ContextFlags, err error)
func (*Ctx) Descriptor ¶
func (*Ctx) Descriptor3 ¶
func (ctx *Ctx) Descriptor3(hArray Array) (pArrayDescriptor Array3Desc, err error)
func (*Ctx) DestroyArray ¶
func (*Ctx) DestroyEvent ¶
func (*Ctx) DestroyStream ¶
func (*Ctx) DisablePeerAccess ¶
func (*Ctx) EnablePeerAccess ¶
func (*Ctx) FilterMode ¶
func (ctx *Ctx) FilterMode(hTexRef TexRef) (pfm FilterMode, err error)
func (*Ctx) FunctionAttribute ¶
func (ctx *Ctx) FunctionAttribute(fn Function, attrib FunctionAttribute) (pi int, err error)
func (*Ctx) LaunchKernel ¶
func (*Ctx) MakeStream ¶
func (ctx *Ctx) MakeStream(flags StreamFlags) (stream Stream, err error)
func (*Ctx) MakeStreamWithPriority ¶
func (ctx *Ctx) MakeStreamWithPriority(priority int, flags StreamFlags) (Stream, error)
func (*Ctx) MaxAnisotropy ¶
func (*Ctx) MemAllocManaged ¶
func (ctx *Ctx) MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
func (*Ctx) MemAllocPitch ¶
func (*Ctx) MemFreeHost ¶
func (*Ctx) Memcpy2D ¶
func (ctx *Ctx) Memcpy2D(pCopy Memcpy2dParam)
func (*Ctx) Memcpy2DAsync ¶
func (ctx *Ctx) Memcpy2DAsync(pCopy Memcpy2dParam, hStream Stream)
func (*Ctx) Memcpy2DUnaligned ¶
func (ctx *Ctx) Memcpy2DUnaligned(pCopy Memcpy2dParam)
func (*Ctx) Memcpy3D ¶
func (ctx *Ctx) Memcpy3D(pCopy Memcpy3dParam)
func (*Ctx) Memcpy3DAsync ¶
func (ctx *Ctx) Memcpy3DAsync(pCopy Memcpy3dParam, hStream Stream)
func (*Ctx) Memcpy3DPeer ¶
func (ctx *Ctx) Memcpy3DPeer(pCopy Memcpy3dPeerParam)
func (*Ctx) Memcpy3DPeerAsync ¶
func (ctx *Ctx) Memcpy3DPeerAsync(pCopy Memcpy3dPeerParam, hStream Stream)
func (*Ctx) MemcpyAsync ¶
func (*Ctx) MemcpyAtoA ¶
func (*Ctx) MemcpyAtoD ¶
func (*Ctx) MemcpyAtoH ¶
func (*Ctx) MemcpyAtoHAsync ¶
func (*Ctx) MemcpyDtoA ¶
func (*Ctx) MemcpyDtoD ¶
func (*Ctx) MemcpyDtoDAsync ¶
func (*Ctx) MemcpyDtoH ¶
func (*Ctx) MemcpyDtoHAsync ¶
func (*Ctx) MemcpyHtoA ¶
func (*Ctx) MemcpyHtoAAsync ¶
func (*Ctx) MemcpyHtoD ¶
func (*Ctx) MemcpyHtoDAsync ¶
func (*Ctx) MemcpyPeer ¶
func (*Ctx) MemcpyPeerAsync ¶
func (*Ctx) MemsetD16Async ¶
func (*Ctx) MemsetD2D16 ¶
func (*Ctx) MemsetD2D16Async ¶
func (*Ctx) MemsetD2D32 ¶
func (*Ctx) MemsetD2D32Async ¶
func (*Ctx) MemsetD2D8 ¶
func (*Ctx) MemsetD2D8Async ¶
func (*Ctx) MemsetD32Async ¶
func (*Ctx) MemsetD8Async ¶
func (*Ctx) ModuleFunction ¶
func (*Ctx) ModuleGlobal ¶
func (*Ctx) ModuleSurfRef ¶ added in v0.9.1
func (*Ctx) ModuleTexRef ¶ added in v0.9.1
func (*Ctx) QueryEvent ¶
func (*Ctx) QueryStream ¶
func (*Ctx) ResetL2Cache ¶ added in v0.9.4
func (ctx *Ctx) ResetL2Cache()
func (*Ctx) Run ¶
Run locks the goroutine to the OS thread and ties the CUDA context to that thread. For most cases, this would suffice.
Note: the errChan that is passed in should NOT be the same errChan as the one used internally for signalling. The main reason for passing in an error channel is to support two different kinds of run modes:
The typical use example is as such:
func A() {
	ctx := NewContext(d, SchedAuto)
	errChan := make(chan error)
	go ctx.Run(errChan)
	if err := <-errChan; err != nil {
		// handle error
	}
	doSomethingWithCtx(ctx)
}
And yet another run mode supported is running of the context in the main thread:
func main() {
	ctx := NewContext(d, SchedAuto)
	go doSomethingWithCtx(ctx)
	if err := ctx.Run(nil); err != nil {
		// handle error
	}
}
func (*Ctx) SetAddress ¶
func (*Ctx) SetAddress2D ¶
func (*Ctx) SetAddressMode ¶
func (ctx *Ctx) SetAddressMode(hTexRef TexRef, dim int, am AddressMode)
func (*Ctx) SetBorderColor ¶
func (*Ctx) SetCacheConfig ¶
func (ctx *Ctx) SetCacheConfig(fn Function, config FuncCacheConfig)
func (*Ctx) SetCurrentCacheConfig ¶
func (ctx *Ctx) SetCurrentCacheConfig(config FuncCacheConfig)
func (*Ctx) SetFilterMode ¶
func (ctx *Ctx) SetFilterMode(hTexRef TexRef, fm FilterMode)
func (*Ctx) SetFunctionSharedMemConfig ¶
func (ctx *Ctx) SetFunctionSharedMemConfig(fn Function, config SharedConfig)
func (*Ctx) SetMaxAnisotropy ¶
func (*Ctx) SetMipmapFilterMode ¶
func (ctx *Ctx) SetMipmapFilterMode(hTexRef TexRef, fm FilterMode)
func (*Ctx) SetMipmapLevelBias ¶
func (*Ctx) SetMipmapLevelClamp ¶
func (*Ctx) SetSharedMemConfig ¶
func (ctx *Ctx) SetSharedMemConfig(config SharedConfig)
func (*Ctx) SetTexRefFlags ¶
func (ctx *Ctx) SetTexRefFlags(hTexRef TexRef, Flags TexRefFlags)
func (*Ctx) SharedMemConfig ¶
func (ctx *Ctx) SharedMemConfig() (pConfig SharedConfig, err error)
func (*Ctx) StreamPriorityRange ¶
func (*Ctx) SurfRefSetArray ¶
func (*Ctx) Synchronize ¶
func (ctx *Ctx) Synchronize()
func (*Ctx) SynchronizeEvent ¶
func (*Ctx) SynchronizeStream ¶
func (*Ctx) TexRefSetArray ¶
func (*Ctx) WaitOnValue32 ¶
type Device ¶
type Device int
Device is the representation of a CUDA device
func CurrentDevice ¶
func (Device) Attributes ¶
func (dev Device) Attributes(attrs ...DeviceAttribute) ([]int, error)
Attributes gets multiple attributes as provided
func (Device) CanAccessPeer ¶
func (Device) ComputeCapability ¶
ComputeCapability returns the compute capability of the device. This method is a convenience method for the deprecated API call cuDeviceComputeCapability.
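For example, a small helper (hypothetical) combining Name, ComputeCapability and Attributes; the assumption is that Attributes returns values in the same order as the requested attributes:

func describeDevice(dev cu.Device) error {
	name, err := dev.Name()
	if err != nil {
		return err
	}
	major, minor, err := dev.ComputeCapability()
	if err != nil {
		return err
	}
	attrs, err := dev.Attributes(cu.MultiprocessorCount, cu.MaxThreadsPerBlock, cu.WarpSize)
	if err != nil {
		return err
	}
	fmt.Printf("%s: compute %d.%d, %d SMs, %d threads/block, warp size %d\n",
		name, major, minor, attrs[0], attrs[1], attrs[2])
	return nil
}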
func (Device) MakeContext ¶
func (d Device) MakeContext(flags ContextFlags) (CUContext, error)
func (Device) Name ¶
Name returns the name of the device.
Wrapper over cuDeviceGetName: http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gef75aa30df95446a845f2a7b9fffbb7f
func (Device) P2PAttribute ¶
func (srcDevice Device) P2PAttribute(attrib P2PAttribute, dstDevice Device) (value int, err error)
func (Device) PrimaryCtxState ¶
func (dev Device) PrimaryCtxState() (flags ContextFlags, active int, err error)
func (Device) ReleasePrimaryCtx ¶
func (Device) ResetPrimaryCtx ¶
func (Device) RetainPrimaryCtx ¶
RetainPrimaryCtx retains the primary context on the GPU, creating it if necessary, and increases its usage count.
The caller must call d.ReleasePrimaryCtx() when done using the context. Unlike MakeContext() the newly created context is not pushed onto the stack.
Context creation will fail with error `UnknownError` if the compute mode of the device is CU_COMPUTEMODE_PROHIBITED. The function cuDeviceGetAttribute() can be used with CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the device. The nvidia-smi tool can be used to set the compute mode for devices. Documentation for nvidia-smi can be obtained by passing a -h option to it. Please note that the primary context always supports pinned allocations. Other flags can be specified by cuDevicePrimaryCtxSetFlags().
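A sketch of pairing RetainPrimaryCtx with ReleasePrimaryCtx as required above. Making the retained context current via SetCurrentContext is an assumption, since the primary context is not pushed onto the stack automatically:

func withPrimaryCtx(dev cu.Device, fn func() error) error {
	cuctx, err := dev.RetainPrimaryCtx()
	if err != nil {
		return err
	}
	defer dev.ReleasePrimaryCtx()

	// The retained context is not pushed onto the stack, so make it current explicitly.
	if err := cu.SetCurrentContext(cuctx); err != nil {
		return err
	}
	return fn()
}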
func (Device) SetPrimaryCtxFlags ¶
func (dev Device) SetPrimaryCtxFlags(flags ContextFlags) (err error)
type DeviceAttribute ¶
type DeviceAttribute int
DeviceAttribute represents the device attributes that the user can query CUDA for.
const (
	MaxThreadsPerBlock DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block
	MaxBlockDimX DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X
	MaxBlockDimY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y
	MaxBlockDimZ DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z
	MaxGridDimX DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X
	MaxGridDimY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y
	MaxGridDimZ DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z
	TotalConstantMemory DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
	WarpSize DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads
	MaxPitch DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies
	MaxRegistersPerBlock DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block
	RegistersPerBlock DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
	ClockRate DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Typical clock frequency in kilohertz
	TextureAlignment DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures
	GpuOverlap DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP // Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT.
	MultiprocessorCount DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device
	KernelExecTimeout DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels
	Integrated DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory
	CanMapHostMemory DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space
	ComputeMode DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See CUcomputemode for details)
	MaximumTexture1dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width
	MaximumTexture2dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width
	MaximumTexture2dHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height
	MaximumTexture3dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width
	MaximumTexture3dHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height
	MaximumTexture3dDepth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth
	MaximumTexture2dLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width
	MaximumTexture2dLayeredHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
	MaximumTexture2dLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
	MaximumTexture2dArrayWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH
	MaximumTexture2dArrayHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT
	MaximumTexture2dArrayNumslices DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS
	SurfaceAlignment DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces
	ConcurrentKernels DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently
	EccEnabled DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled
	PciBusID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device
	PciDeviceID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device
	TccDriver DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model
	MemoryClockRate DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz
	GlobalMemoryBusWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits
	L2CacheSize DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes
	MaxThreadsPerMultiprocessor DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor
	AsyncEngineCount DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines
	UnifiedAddressing DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device shares a unified address space with the host
	MaximumTexture1dLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width
	MaximumTexture1dLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
	CanTex2dGather DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER // Deprecated, do not use.
	MaximumTexture2dGatherWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH // Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
	MaximumTexture2dGatherHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT // Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
	MaximumTexture3dWidthAlternate DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE // Alternate maximum 3D texture width
	MaximumTexture3dHeightAlternate DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE // Alternate maximum 3D texture height
	MaximumTexture3dDepthAlternate DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE // Alternate maximum 3D texture depth
	PciDomainID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID // PCI domain ID of the device
	TexturePitchAlignment DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT // Pitch alignment requirement for textures
	MaximumTexturecubemapWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH // Maximum cubemap texture width/height
	MaximumTexturecubemapLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH // Maximum cubemap layered texture width/height
	MaximumTexturecubemapLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS // Maximum layers in a cubemap layered texture
	MaximumSurface1dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH // Maximum 1D surface width
	MaximumSurface2dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH // Maximum 2D surface width
	MaximumSurface2dHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT // Maximum 2D surface height
	MaximumSurface3dWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH // Maximum 3D surface width
	MaximumSurface3dHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT // Maximum 3D surface height
	MaximumSurface3dDepth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH // Maximum 3D surface depth
	MaximumSurface1dLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH // Maximum 1D layered surface width
	MaximumSurface1dLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS // Maximum layers in a 1D layered surface
	MaximumSurface2dLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH // Maximum 2D layered surface width
	MaximumSurface2dLayeredHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT // Maximum 2D layered surface height
	MaximumSurface2dLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS // Maximum layers in a 2D layered surface
	MaximumSurfacecubemapWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH // Maximum cubemap surface width
	MaximumSurfacecubemapLayeredWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH // Maximum cubemap layered surface width
	MaximumSurfacecubemapLayeredLayers DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS // Maximum layers in a cubemap layered surface
	MaximumTexture1dLinearWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH // Maximum 1D linear texture width
	MaximumTexture2dLinearWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH // Maximum 2D linear texture width
	MaximumTexture2dLinearHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT // Maximum 2D linear texture height
	MaximumTexture2dLinearPitch DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH // Maximum 2D linear texture pitch in bytes
	MaximumTexture2dMipmappedWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH // Maximum mipmapped 2D texture width
	MaximumTexture2dMipmappedHeight DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT // Maximum mipmapped 2D texture height
	ComputeCapabilityMajor DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR // Major compute capability version number
	ComputeCapabilityMinor DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR // Minor compute capability version number
	MaximumTexture1dMipmappedWidth DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH // Maximum mipmapped 1D texture width
	StreamPrioritiesSupported DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED // Device supports stream priorities
	GlobalL1CacheSupported DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED // Device supports caching globals in L1
	LocalL1CacheSupported DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED // Device supports caching locals in L1
	MaxRegistersPerMultiprocessor DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR // Maximum number of 32-bit registers available per multiprocessor
	ManagedMemory DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY // Device can allocate managed memory on this system
	MultiGpuBoard DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD // Device is on a multi-GPU board
	MultiGpuBoardGroupID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID // Unique id for a group of devices on the same multi-GPU board
	HostNativeAtomicSupported DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED // Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)
	SingleToDoublePrecisionPerfRatio DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO // Ratio of single precision performance (in floating-point operations per second) to double precision performance
	PageableMemoryAccess DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS // Device supports coherently accessing pageable memory without calling cudaHostRegister on it
	ConcurrentManagedAccess DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS // Device can coherently access managed memory concurrently with the CPU
	ComputePreemptionSupported DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED // Device supports compute preemption.
	CanUseHostPointerForRegisteredMem DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM // Device can access host registered memory at the same virtual address as the CPU
)
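For orientation, the sketch below queries a few of these attributes. It assumes the package's Device type and an Attribute(DeviceAttribute) (int, error) method on it, neither of which is reproduced in this excerpt, and it is written as if it lived inside package cu (identifiers unqualified).

package cu

import "fmt"

// printDeviceInfo is a hedged sketch: the Device type and its Attribute
// method are assumed, as described in the surrounding text.
func printDeviceInfo(dev Device) error {
	warp, err := dev.Attribute(WarpSize)
	if err != nil {
		return err
	}
	major, err := dev.Attribute(ComputeCapabilityMajor)
	if err != nil {
		return err
	}
	minor, err := dev.Attribute(ComputeCapabilityMinor)
	if err != nil {
		return err
	}
	fmt.Printf("compute capability %d.%d, warp size %d\n", major, minor, warp)
	return nil
}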
type DevicePtr ¶
type DevicePtr uintptr
DevicePtr is a pointer to the device memory. It is equivalent to CUDA's CUdeviceptr
func AllocAndCopy ¶
AllocAndCopy abstracts away the common pattern of allocating and then copying a Go slice to the GPU
func MemAllocManaged ¶
func MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
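The following is a minimal sketch of the allocate-then-copy pattern that AllocAndCopy wraps, written against MemAllocManaged and the package-level copy/free functions listed in the index. It assumes a CUDA context is already current on the calling OS thread and that the slice is non-empty.

package cu

import "unsafe"

// copyToManaged allocates managed memory and copies a non-empty Go slice
// into it. The caller is responsible for eventually calling MemFree on the
// returned pointer.
func copyToManaged(data []float32) (DevicePtr, error) {
	size := int64(len(data)) * int64(unsafe.Sizeof(float32(0)))
	d, err := MemAllocManaged(size, AttachGlobal)
	if err != nil {
		return 0, err
	}
	if err := MemcpyHtoD(d, unsafe.Pointer(&data[0]), size); err != nil {
		MemFree(d) // best-effort cleanup on failure
		return 0, err
	}
	return d, nil
}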
func MemAllocPitch ¶
func (DevicePtr) AddressRange ¶
func (DevicePtr) IsCUDAMemory ¶ added in v0.9.1
IsCUDAMemory returns true.
func (DevicePtr) MemAdvise ¶
MemAdvise advises the Unified Memory subsystem about the usage pattern for the memory range starting at d with a size of count bytes. The start address and end address of the memory range will be rounded down and rounded up respectively to be aligned to CPU page size before the advice is applied. The memory range must refer to managed memory allocated via `MemAllocManaged` or declared via __managed__ variables.
The advice parameter can take any of the following values (a short usage sketch follows the list):
- SetReadMostly: This implies that the data is mostly going to be read from and only occasionally written to. Any read accesses from any processor to this region will create a read-only copy of at least the accessed pages in that processor's memory. Additionally, if cuMemPrefetchAsync is called on this region, it will create a read-only copy of the data on the destination processor. If any processor writes to this region, all copies of the corresponding page will be invalidated except for the one where the write occurred. The device argument is ignored for this advice. Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU that has a non-zero value for the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Also, if a context is created on a device that does not have the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until all such contexts are destroyed.
- UnsetReadMostly: Undoes the effect of SetReadMostly and also prevents the Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated copies of the data will be collapsed into a single copy. The location for the collapsed copy will be the preferred location if the page has a preferred location and one of the read-duplicated copies was resident at that location. Otherwise, the location chosen is arbitrary.
- SetPreferredLocation: This advice sets the preferred location for the data to be the memory belonging to device. Passing in CU_DEVICE_CPU for device sets the preferred location as host memory. If device is a GPU, then it must have a non-zero value for the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy when a fault occurs on that memory region. If the data is already in its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated, then data migration will be avoided. On the other hand, if the data is not in its preferred location or if a direct mapping cannot be established, then it will be migrated to the processor accessing it. It is important to note that setting the preferred location does not prevent data prefetching done using cuMemPrefetchAsync. Having a preferred location can override the page thrash detection and resolution logic in the Unified Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device memory, the page may eventually be pinned to host memory by the Unified Memory driver. But if the preferred location is set as device memory, then the page will continue to thrash indefinitely. If CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the policies associated with that advice will override the policies of this advice.
- UnsetPreferredLocation: Undoes the effect of SetPreferredLocation and changes the preferred location to none.
- SetAccessedBy: This advice implies that the data will be accessed by device. Passing in CU_DEVICE_CPU for device will set the advice for the CPU. If device is a GPU, then the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. This advice does not cause data migration and has no impact on the location of the data per se. Instead, it causes the data to always be mapped in the specified processor's page tables, as long as the location of the data permits a mapping to be established. If the data gets migrated for any reason, the mappings are updated accordingly. This advice is recommended in scenarios where data locality is not important, but avoiding faults is. Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data over to the other GPUs is not as important because the accesses are infrequent and the overhead of migration may be too high. But preventing faults can still help improve performance, and so having a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated to host memory because the CPU typically cannot access device memory directly. Any GPU that had the CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the page in host memory. If CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the policies associated with that advice will override the policies of this advice. Additionally, if the preferred location of this memory region or any subset of it is also device, then the policies associated with CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
- UnsetAccessedBy: Undoes the effect of SetAccessedBy. Any mappings to the data from device may be removed at any time causing accesses to result in non-fatal page faults.
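A hedged usage sketch: MemAdvise's parameter list is not reproduced above, so the call below assumes it mirrors the underlying cuMemAdvise(devPtr, count, advice, device) ordering, and the Device type is likewise assumed from elsewhere in the package.

package cu

// adviseReadMostly marks a managed range as read-mostly for dev. The
// parameter order (count, advice, device) is an assumption; the signature is
// not shown in this documentation.
func adviseReadMostly(d DevicePtr, size int64, dev Device) error {
	return d.MemAdvise(size, SetReadMostly, dev)
}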
func (DevicePtr) MemPrefetchAsync ¶
MemPrefetchAsync prefetches memory to the specified destination device. devPtr is the base device pointer of the memory to be prefetched and dstDevice is the destination device. count specifies the number of bytes to copy. hStream is the stream in which the operation is enqueued. The memory range must refer to managed memory allocated via cuMemAllocManaged or declared via __managed__ variables. Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to host memory. If dstDevice is a GPU, then the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. Additionally, hStream must be associated with a device that has a non-zero value for the device attribute CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
The start address and end address of the memory range will be rounded down and rounded up respectively to be aligned to CPU page size before the prefetch operation is enqueued in the stream.
If no physical memory has been allocated for this region, then this memory region will be populated and mapped on the destination device. If there's insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages from other cuMemAllocManaged allocations to host memory in order to make room. Device memory allocated using cuMemAlloc or cuArrayCreate will not be evicted.
By default, any mappings to the previous location of the migrated pages are removed and mappings for the new location are only setup on dstDevice. The exact behavior however also depends on the settings applied to this memory range via cuMemAdvise as described below:
If CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, then that subset will create a read-only copy of the pages on dstDevice.
If CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory range, then the pages will be migrated to dstDevice even if dstDevice is not the preferred location of any pages in the memory range.
If CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, then mappings to those pages from all the appropriate processors are updated to refer to the new location if establishing such a mapping is possible. Otherwise, those mappings are cleared.
Note that this API is not required for functionality and only serves to improve performance by allowing the application to migrate data to a suitable location before it is accessed. Memory accesses to this range are always coherent and are allowed even when the data is actively being migrated.
Note that this function is asynchronous with respect to the host and all work on other devices.
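A similarly hedged sketch of prefetching: the parameters of MemPrefetchAsync are elided above, so the call below assumes they mirror cuMemPrefetchAsync(devPtr, count, dstDevice, hStream).

package cu

// prefetchToDevice migrates a managed range to dst ahead of use. The
// parameter order (count, device, stream) is an assumption; the signature is
// not shown in this documentation.
func prefetchToDevice(d DevicePtr, size int64, dst Device, s Stream) error {
	return d.MemPrefetchAsync(size, dst, s)
}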
func (DevicePtr) MemSize ¶
MemSize returns the size of the memory slab in bytes. Returns 0 if an error occurred.
func (DevicePtr) MemoryType ¶
func (mem DevicePtr) MemoryType() (typ MemoryType, err error)
MemoryType returns the MemoryType of the memory
func (DevicePtr) PtrAttribute ¶
func (d DevicePtr) PtrAttribute(attr PointerAttribute) (unsafe.Pointer, error)
PtrAttribute returns information about a pointer.
func (DevicePtr) SetPtrAttribute ¶
func (d DevicePtr) SetPtrAttribute(value unsafe.Pointer, attr PointerAttribute) error
SetPtrAttribute sets attributes on a previously allocated memory region. The supported attributes are:
SymcMemopsAttr: A boolean attribute that can either be set (1) or unset (0). When set, the region of memory that ptr points to is guaranteed to always synchronize memory operations that are synchronous. If there are some previously initiated synchronous memory operations that are pending when this attribute is set, the function does not return until those memory operations are complete. See further documentation in the section titled "API synchronization behavior" to learn more about cases when synchronous memory operations can exhibit asynchronous behavior. `value` will be considered as a pointer to an unsigned integer to which this attribute is to be set.
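A minimal sketch of enabling that attribute, using the SetPtrAttribute signature shown above and the SymcMemopsAttr constant listed under PointerAttribute later in this document:

package cu

import "unsafe"

// enableSyncMemops sets the sync-memops attribute on d. Per the description
// above, the value is passed as a pointer to an unsigned integer set to 1.
func enableSyncMemops(d DevicePtr) error {
	var enable uint32 = 1
	return d.SetPtrAttribute(unsafe.Pointer(&enable), SymcMemopsAttr)
}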
type ErrorLister ¶
type ErrorLister interface {
ListErrors() []error
}
ErrorLister is the interface for a slice of error
type Event ¶
type Event struct {
// contains filtered or unexported fields
}
Event represents a CUDA event
func MakeEvent ¶
func MakeEvent(flags EventFlags) (event Event, err error)
func (Event) Synchronize ¶
type EventFlags ¶
type EventFlags byte
EventFlags are flags to be used with event creation
const (
	DefaultEvent EventFlags = C.CU_EVENT_DEFAULT // Default event flag
	BlockingSyncEvent EventFlags = C.CU_EVENT_BLOCKING_SYNC // Event uses blocking synchronization
	DisableTiming EventFlags = C.CU_EVENT_DISABLE_TIMING // Event will not record timing data
	InterprocessEvent EventFlags = C.CU_EVENT_INTERPROCESS // Event is suitable for interprocess use. DisableTiming must be set
)
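A small sketch of creating and tearing down events with MakeEvent and DestroyEvent. The recording calls are not part of this excerpt, and Event.Synchronize is assumed to return only an error.

package cu

// withEvents creates two events and synchronizes on the second one.
func withEvents() error {
	start, err := MakeEvent(DefaultEvent)
	if err != nil {
		return err
	}
	defer DestroyEvent(&start)

	stop, err := MakeEvent(DefaultEvent)
	if err != nil {
		return err
	}
	defer DestroyEvent(&stop)

	// ... record start and stop around work submitted to a stream ...

	return stop.Synchronize()
}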
type ExecGraph ¶ added in v0.9.4
type ExecGraph struct {
// contains filtered or unexported fields
}
ExecGraph represents a CUDA execution graph.
type FilterMode ¶
type FilterMode byte
FilterMode are texture reference filtering modes
const (
	PointFilterMode FilterMode = C.CU_TR_FILTER_MODE_POINT // Point filter mode
	LinearFilterMode FilterMode = C.CU_TR_FILTER_MODE_LINEAR // Linear filter mode
)
type Format ¶
type Format byte
Format is the type of array (think array types)
const (
	Uint8 Format = C.CU_AD_FORMAT_UNSIGNED_INT8 // Unsigned 8-bit integers
	Uint16 Format = C.CU_AD_FORMAT_UNSIGNED_INT16 // Unsigned 16-bit integers
	Uin32 Format = C.CU_AD_FORMAT_UNSIGNED_INT32 // Unsigned 32-bit integers
	Int8 Format = C.CU_AD_FORMAT_SIGNED_INT8 // Signed 8-bit integers
	Int16 Format = C.CU_AD_FORMAT_SIGNED_INT16 // Signed 16-bit integers
	Int32 Format = C.CU_AD_FORMAT_SIGNED_INT32 // Signed 32-bit integers
	Float16 Format = C.CU_AD_FORMAT_HALF // 16-bit floating point
	Float32 Format = C.CU_AD_FORMAT_FLOAT // 32-bit floating point
)
type FuncCacheConfig ¶
type FuncCacheConfig byte
FuncCacheConfig represents the CUfunc_cache enum type, which are enumerations for cache configurations
const (
	PreferNone FuncCacheConfig = C.CU_FUNC_CACHE_PREFER_NONE // no preference for shared memory or L1 (default)
	PreferL1 FuncCacheConfig = C.CU_FUNC_CACHE_PREFER_L1 // prefer larger L1 cache and smaller shared memory
	PreferEqual FuncCacheConfig = C.CU_FUNC_CACHE_PREFER_EQUAL // prefer equal sized L1 cache and shared memory
)
func CurrentCacheConfig ¶
func CurrentCacheConfig() (pconfig FuncCacheConfig, err error)
type Function ¶
type Function struct {
// contains filtered or unexported fields
}
Function represents a CUDA function
func (Function) Attribute ¶
func (fn Function) Attribute(attrib FunctionAttribute) (pi int, err error)
func (Function) Launch ¶ added in v0.9.1
func (fn Function) Launch(gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) error
Launch launches a CUDA function
func (Function) LaunchAndSync ¶
func (fn Function) LaunchAndSync(gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) error
LaunchAndSync launches the kernel and synchronizes the context
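A sketch of launching a hypothetical kernel through LaunchAndSync. Each kernel argument is passed as an unsafe.Pointer to a variable holding its value; the kernel's parameter list (a device pointer plus an element count) is an assumption for illustration only.

package cu

import "unsafe"

// launchScale launches a hypothetical kernel that takes a device pointer and
// an element count, then waits for it to finish.
func launchScale(fn Function, data DevicePtr, n int32, stream Stream) error {
	args := []unsafe.Pointer{
		unsafe.Pointer(&data),
		unsafe.Pointer(&n),
	}
	const threadsPerBlock = 256
	blocks := (int(n) + threadsPerBlock - 1) / threadsPerBlock
	return fn.LaunchAndSync(blocks, 1, 1, threadsPerBlock, 1, 1, 0, stream, args)
}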
func (Function) MaxActiveBlocksPerMultiProcessor ¶
func (fn Function) MaxActiveBlocksPerMultiProcessor(blockSize int, dynamicSmemSize int64) (int, error)
MaxActiveBlocksPerMultiProcessor returns the number of the maximum active blocks per streaming multiprocessor.
func (Function) MaxActiveBlocksPerMultiProcessorWithFlags ¶
func (fn Function) MaxActiveBlocksPerMultiProcessorWithFlags(blockSize int, dynamicSmemSize int64, flags OccupancyFlags) (int, error)
MaxActiveBlocksPerMultiProcessorWithFlags returns the number of the maximum active blocks per streaming multiprocessor. The flags control how special cases are handled.
func (Function) SetCacheConfig ¶
func (fn Function) SetCacheConfig(config FuncCacheConfig) (err error)
func (Function) SetSharedMemConfig ¶
func (fn Function) SetSharedMemConfig(config SharedConfig) (err error)
type FunctionAttribute ¶
type FunctionAttribute int
FunctionAttribute is a representation of the properties of a function
const (
	FnMaxThreadsPerBlock FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. This number depends on both the function and the device on which the function is currently loaded.
	ConstSizeBytes FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function.
	LocalSizeBytes FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function.
	NumRegs FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function.
	PtxVersion FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
	BinaryVersion FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version.
	CacheModeCa FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA // The attribute to indicate whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set.
)
type Graph ¶ added in v0.9.4
type Graph struct {
// contains filtered or unexported fields
}
Graph represents a CUDA graph
func (Graph) AddDependencies ¶ added in v0.9.4
AddDependencies adds edges to the graph. Both `from` and `to` must be the same length. An edge will be added from from[i] to to[i]. If an edge already exists between the nodes, an error will be returned.
func (Graph) AddEmptyNode ¶ added in v0.9.4
AddEmptyNode creates an empty node and adds it to the graph. An empty node is a node that performs no operations during execution. It can be used for transitive ordering.
For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
func (Graph) AddHostNode ¶ added in v0.9.4
func (g Graph) AddHostNode(children []Node, params *HostNodeParams) (Node, error)
AddHostNode creates a host execution node and adds it to the graph. When the graph is launched, the node will invoke the specified CPU function. Host nodes are not supported under MPS with pre-Volta GPUs.
func (Graph) AddKernelNode ¶ added in v0.9.4
func (g Graph) AddKernelNode(children []Node, params *KernelNodeParams) (Node, error)
AddKernelNode creates a kernel execution node and adds it to the graph. When the graph is launched, the node will invoke the specified kernel function.
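A sketch of adding a kernel node with no dependencies, filling in only the launch geometry of KernelNodeParams (defined later in this document). Kernel arguments, graph creation, and graph instantiation are outside this excerpt.

package cu

// addKernelNode adds a kernel node with no dependencies to g.
func addKernelNode(g Graph, fn Function) (Node, error) {
	params := &KernelNodeParams{
		Func:      fn,
		GridDimX:  1,
		GridDimY:  1,
		GridDimZ:  1,
		BlockDimX: 256,
		BlockDimY: 1,
		BlockDimZ: 1,
	}
	return g.AddKernelNode(nil, params)
}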
func (Graph) AddMemcpyNode ¶ added in v0.9.4
AddMemcpyNode creates a node which performs memcpy.
func (Graph) AddMemsetNode ¶ added in v0.9.4
func (Graph) Edges ¶ added in v0.9.4
Edges returns the edges between nodes. CUDA's API is quite dodgy and unclear. It is reproduced below:
Returns a list of hGraph's dependency edges. Edges are returned via corresponding indices in from and to; that is, the node in to[i] has a dependency on the node in from[i]. from and to may both be NULL, in which case this function only returns the number of edges in numEdges. Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual number of edges, the remaining entries in from and to will be set to NULL, and the number of edges actually returned will be written to numEdges.
type HostFunction ¶ added in v0.9.4
type HostFunction func()
HostFunction is a closure of a function call with its data.
type HostNodeParams ¶ added in v0.9.4
type HostNodeParams struct {
	Func HostFunction
	Data unsafe.Pointer
	// contains filtered or unexported fields
}
HostNodeParams are parameters passed in to a node that will call a host function (i.e. a function written in Go)
type JITCacheMode ¶ added in v0.9.1
type JITCacheMode struct{ Value JITCacheModeOption }
Specifies whether to enable caching explicitly (-dlcm)
type JITCacheModeOption ¶ added in v0.9.1
type JITCacheModeOption uint64
Caching modes for dlcm
const (
	// Compile with no -dlcm flag specified
	JITCacheNone JITCacheModeOption = C.CU_JIT_CACHE_OPTION_NONE
	// Compile with L1 cache disabled
	JITCacheCG JITCacheModeOption = C.CU_JIT_CACHE_OPTION_CG
	// Compile with L1 cache enabled
	JITCacheCA JITCacheModeOption = C.CU_JIT_CACHE_OPTION_CA
)
type JITErrorLogBuffer ¶ added in v0.9.1
type JITErrorLogBuffer struct{ Buffer []byte }
Buffer in which to print any log messages that reflect errors
type JITFallbackOption ¶ added in v0.9.1
type JITFallbackOption uint64
Cubin matching fallback strategies
const (
	// Prefer to compile ptx if exact binary match not found
	JITPreferPTX JITFallbackOption = C.CU_PREFER_PTX
	// Prefer to fall back to compatible binary code if exact match not found
	JITPreferBinary JITFallbackOption = C.CU_PREFER_BINARY
)
type JITFallbackStrategy ¶ added in v0.9.1
type JITFallbackStrategy struct{ Value JITFallbackOption }
Specifies choice of fallback strategy if matching cubin is not found.
type JITGenerateDebugInfo ¶ added in v0.9.1
type JITGenerateDebugInfo struct{ Enabled bool }
Specifies whether to create debug information in output (-g)
type JITGenerateLineInfo ¶ added in v0.9.1
type JITGenerateLineInfo struct{ Enabled bool }
Generate line number information (-lineinfo)
type JITInfoLogBuffer ¶ added in v0.9.1
type JITInfoLogBuffer struct{ Buffer []byte }
Buffer in which to print any log messages that are informational in nature.
type JITInputType ¶ added in v0.9.1
type JITInputType uint64
const (
	// Compiled device-class-specific device code
	JITInputCUBIN JITInputType = C.CU_JIT_INPUT_CUBIN
	// PTX source code
	JITInputPTX JITInputType = C.CU_JIT_INPUT_PTX
	// Bundle of multiple cubins and/or PTX of some device code
	JITInputFatBinary JITInputType = C.CU_JIT_INPUT_FATBINARY
	// Host object with embedded device code
	JITInputObject JITInputType = C.CU_JIT_INPUT_OBJECT
	// Archive of host objects with embedded device code
	JITInputLibrary JITInputType = C.CU_JIT_INPUT_LIBRARY
)
type JITLogVerbose ¶ added in v0.9.1
type JITLogVerbose struct{ Enabled bool }
Generate verbose log messages (-v)
type JITMaxRegisters ¶ added in v0.9.1
type JITMaxRegisters struct{ Value uint }
Max number of registers that a thread may use.
type JITOptimizationLevel ¶ added in v0.9.1
type JITOptimizationLevel struct{ Value uint }
Level of optimizations to apply to generated code (0 - 4)
type JITOption ¶ added in v0.9.1
type JITOption interface {
// contains filtered or unexported methods
}
type JITTarget ¶ added in v0.9.1
type JITTarget struct{ Value JITTargetOption }
Target is chosen based on supplied Value
type JITTargetFromContext ¶ added in v0.9.1
type JITTargetFromContext struct{}
Determines the target based on the current attached context (default)
type JITTargetOption ¶ added in v0.9.1
type JITTargetOption uint64
const (
	// JITTarget10 JITTargetOption = C.CU_TARGET_COMPUTE_10
	// JITTarget11 JITTargetOption = C.CU_TARGET_COMPUTE_11
	// JITTarget12 JITTargetOption = C.CU_TARGET_COMPUTE_12
	// JITTarget13 JITTargetOption = C.CU_TARGET_COMPUTE_13
	// JITTarget20 JITTargetOption = C.CU_TARGET_COMPUTE_20
	// JITTarget21 JITTargetOption = C.CU_TARGET_COMPUTE_21
	JITTarget30 JITTargetOption = C.CU_TARGET_COMPUTE_30
	JITTarget32 JITTargetOption = C.CU_TARGET_COMPUTE_32
	JITTarget35 JITTargetOption = C.CU_TARGET_COMPUTE_35
	JITTarget37 JITTargetOption = C.CU_TARGET_COMPUTE_37
	JITTarget50 JITTargetOption = C.CU_TARGET_COMPUTE_50
	JITTarget52 JITTargetOption = C.CU_TARGET_COMPUTE_52
	JITTarget53 JITTargetOption = C.CU_TARGET_COMPUTE_53
	JITTarget60 JITTargetOption = C.CU_TARGET_COMPUTE_60
	JITTarget61 JITTargetOption = C.CU_TARGET_COMPUTE_61
	JITTarget62 JITTargetOption = C.CU_TARGET_COMPUTE_62
)
type JITThreadsPerBlock ¶ added in v0.9.1
type JITThreadsPerBlock struct{ Value uint }
Specifies minimum number of threads per block to target compilation
type JITWallTime ¶ added in v0.9.1
type JITWallTime struct{ Result float32 }
Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker.
type KernelNodeParams ¶ added in v0.9.4
type KernelNodeParams struct {
	Func      Function
	GridDimX  uint
	GridDimY  uint
	GridDimZ  uint
	BlockDimX uint
	BlockDimY uint
	BlockDimZ uint
	Params    []*KernelNodeParams
}
KernelNodeParams represents the parameters to launch a kernel in a graph node.
type Limit ¶
type Limit byte
Limit is a flag that can be used to query and set on a context
const (
	StackSize Limit = C.CU_LIMIT_STACK_SIZE // GPU thread stack size
	PrintfFIFOSize Limit = C.CU_LIMIT_PRINTF_FIFO_SIZE // GPU printf FIFO size
	MallocHeapSize Limit = C.CU_LIMIT_MALLOC_HEAP_SIZE // GPU malloc heap size
	DevRuntimeSyncDepth Limit = C.CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH // GPU device runtime launch synchronize depth
	DevRuntimePendingLaunchCount Limit = C.CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT // GPU device runtime pending launch count
)
type LinkState ¶ added in v0.9.1
type LinkState struct {
// contains filtered or unexported fields
}
func (*LinkState) AddData ¶ added in v0.9.1
func (link *LinkState) AddData(input JITInputType, data string, name string, options ...JITOption) error
Add an input to a pending linker invocation
func (*LinkState) AddFile ¶ added in v0.9.1
func (link *LinkState) AddFile(input JITInputType, path string, options ...JITOption) error
Add a file input to a pending linker invocation
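A sketch of feeding a PTX string into an existing link with a couple of the JIT options defined above. How the *LinkState is created is not covered in this excerpt, so the value is assumed to come from elsewhere, and the entry name is purely illustrative.

package cu

// addPTX adds PTX source to a pending link, requesting a specific target
// architecture and a register cap.
func addPTX(link *LinkState, ptx string) error {
	return link.AddData(JITInputPTX, ptx, "kernels.ptx",
		JITTarget{Value: JITTarget61},
		JITMaxRegisters{Value: 64},
	)
}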
type MemAdvice ¶
type MemAdvice byte
MemAdvice is a flag that advises the device on memory usage
const (
	SetReadMostly MemAdvice = C.CU_MEM_ADVISE_SET_READ_MOSTLY // Data will mostly be read and only occasionally be written to
	UnsetReadMostly MemAdvice = C.CU_MEM_ADVISE_UNSET_READ_MOSTLY // Undo the effect of CU_MEM_ADVISE_SET_READ_MOSTLY
	SetPreferredLocation MemAdvice = C.CU_MEM_ADVISE_SET_PREFERRED_LOCATION // Set the preferred location for the data as the specified device
	UnsetPreferredLocation MemAdvice = C.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION // Clear the preferred location for the data
	SetAccessedBy MemAdvice = C.CU_MEM_ADVISE_SET_ACCESSED_BY // Data will be accessed by the specified device, so prevent page faults as much as possible
	UnsetAccessedBy MemAdvice = C.CU_MEM_ADVISE_UNSET_ACCESSED_BY // Let the Unified Memory subsystem decide on the page faulting policy for the specified device
)
type MemAttachFlags ¶
type MemAttachFlags byte
MemAttachFlags are flags for memory attachment (used in allocating memory)
const (
	AttachGlobal MemAttachFlags = C.CU_MEM_ATTACH_GLOBAL // Memory can be accessed by any stream on any device
	AttachHost MemAttachFlags = C.CU_MEM_ATTACH_HOST // Memory cannot be accessed by any stream on any device
	AttachSingle MemAttachFlags = C.CU_MEM_ATTACH_SINGLE // Memory can only be accessed by a single stream on the associated device
)
type Memcpy2dParam ¶
type Memcpy2dParam struct {
	Height        int64
	WidthInBytes  int64
	DstArray      Array
	DstDevice     DevicePtr
	DstHost       unsafe.Pointer
	DstMemoryType MemoryType
	DstPitch      int64
	DstXInBytes   int64
	DstY          int64
	SrcArray      Array
	SrcDevice     DevicePtr
	SrcHost       unsafe.Pointer
	SrcMemoryType MemoryType
	SrcPitch      int64
	SrcXInBytes   int64
	SrcY          int64
}
Memcpy2dParam is a struct representing the params of a 2D memory copy instruction. To aid usability, the fields are ordered as per the documentation (the actual struct is laid out differently).
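A sketch of a pitched host-to-device copy built from the fields above and the package-level Memcpy2D function listed in the index:

package cu

import "unsafe"

// copy2DHostToDevice performs a pitched host-to-device copy. widthBytes is
// the number of bytes per row to copy; srcPitch and dstPitch are the row
// strides of the source and destination allocations.
func copy2DHostToDevice(dst DevicePtr, dstPitch int64, src unsafe.Pointer, srcPitch, widthBytes, height int64) error {
	p := Memcpy2dParam{
		SrcMemoryType: HostMemory,
		SrcHost:       src,
		SrcPitch:      srcPitch,

		DstMemoryType: DeviceMemory,
		DstDevice:     dst,
		DstPitch:      dstPitch,

		WidthInBytes: widthBytes,
		Height:       height,
	}
	return Memcpy2D(p)
}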
type Memcpy3dParam ¶
type Memcpy3dParam struct {
	Depth         int64
	Height        int64
	WidthInBytes  int64
	DstArray      Array
	DstDevice     DevicePtr
	DstHeight     int64
	DstHost       unsafe.Pointer
	DstLOD        int64
	DstMemoryType MemoryType
	DstPitch      int64
	DstXInBytes   int64
	DstY          int64
	DstZ          int64
	SrcArray      Array
	SrcDevice     DevicePtr
	SrcHeight     int64
	SrcHost       unsafe.Pointer
	SrcLOD        int64
	SrcMemoryType MemoryType
	SrcPitch      int64
	SrcXInBytes   int64
	SrcY          int64
	SrcZ          int64
}
Memcpy3dParam is a struct representing the params of a 3D memory copy instruction. To aid usability, the fields are ordered as per the documentation (the actual struct is laid out differently).
type Memcpy3dPeerParam ¶
type Memcpy3dPeerParam struct {
	Depth         int64
	Height        int64
	WidthInBytes  int64
	DstArray      Array
	DstContext    CUContext
	DstDevice     DevicePtr
	DstHeight     int64
	DstHost       unsafe.Pointer
	DstLOD        int64
	DstMemoryType MemoryType
	DstPitch      int64
	DstXInBytes   int64
	DstY          int64
	DstZ          int64
	SrcArray      Array
	SrcContext    CUContext
	SrcDevice     DevicePtr
	SrcHeight     int64
	SrcHost       unsafe.Pointer
	SrcLOD        int64
	SrcMemoryType MemoryType
	SrcPitch      int64
	SrcXInBytes   int64
	SrcY          int64
	SrcZ          int64
}
Memcpy3dPeerParam is a struct representing the params of a 3D memory copy instruction across contexts. To aid usability, the fields are ordered as per the documentation (the actual struct is laid out differently).
type MemoryType ¶
type MemoryType byte
MemoryType is a representation of the memory types of the device pointer
const (
	HostMemory MemoryType = C.CU_MEMORYTYPE_HOST // Host memory
	DeviceMemory MemoryType = C.CU_MEMORYTYPE_DEVICE // Device memory
	ArrayMemory MemoryType = C.CU_MEMORYTYPE_ARRAY // Array memory
	UnifiedMemory MemoryType = C.CU_MEMORYTYPE_UNIFIED // Unified device or host memory
)
type MemsetParams ¶ added in v0.9.4
type Module ¶
type Module struct {
// contains filtered or unexported fields
}
Module represents a CUDA Module
func Load ¶
Load loads a module into the current context. The CUDA driver API does not attempt to lazily allocate the resources needed by a module; if the memory for functions and data (constant and global) needed by the module cannot be allocated, `Load()` fails.
The file should be a cubin file as output by nvcc, a PTX file either as output by nvcc or handwritten, or a fatbin file as output by nvcc from toolchain 4.0 or later.
func LoadDataEx ¶ added in v0.9.1
LoadDataEx loads a module from an input string.
func LoadFatBinary ¶ added in v0.9.1
LoadFatBinary loads a module from an input string.
func (Module) Function ¶
Function returns a reference to the function in the module with the given name. If it's not found, the error NotFound is returned.
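A hedged sketch of the load-then-lookup pattern. The exact signatures of Load and Module.Function are elided above; the sketch assumes Load(path) (Module, error) and (Module).Function(name) (Function, error).

package cu

// loadKernel loads a module from a cubin/PTX/fatbin file and looks up a
// kernel in it. The signatures used here are assumptions, as noted above.
func loadKernel(path, name string) (Function, error) {
	mod, err := Load(path)
	if err != nil {
		return Function{}, err
	}
	return mod.Function(name)
}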
type Node ¶ added in v0.9.4
type Node struct {
// contains filtered or unexported fields
}
Node represents a CUDA graph node
type OccupancyFlags ¶
type OccupancyFlags byte
OccupancyFlags represents the flags to the occupancy calculator
const (
	DefaultOccupancy OccupancyFlags = C.CU_OCCUPANCY_DEFAULT // Default behavior
	DisableCachingOverride OccupancyFlags = C.CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE // Assume global caching is enabled and cannot be automatically turned off
)
type P2PAttribute ¶
type P2PAttribute byte
P2PAttribute is a representation of P2P attributes
const (
	PerformanceRank P2PAttribute = C.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK // A relative value indicating the performance of the link between two devices
	P2PAccessSupported P2PAttribute = C.CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED // P2P Access is enabled
	P2PNativeAomicSupported P2PAttribute = C.CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED // Atomic operation over the link is supported
)
type PointerAttribute ¶
type PointerAttribute int
PointerAttribute is a representation of the metadata of pointers
const (
	ContextAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_CONTEXT // The CUcontext on which a pointer was allocated or registered
	MemoryTypeAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE // The CUmemorytype describing the physical location of a pointer
	DevicePointerAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_DEVICE_POINTER // The address at which a pointer's memory may be accessed on the device
	HostPointerAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_HOST_POINTER // The address at which a pointer's memory may be accessed on the host
	P2PTokenAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_P2P_TOKENS // A pair of tokens for use with the nv-p2p.h Linux kernel interface
	SymcMemopsAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS // Synchronize every synchronous memory operation initiated on this region
	BufferIDAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_BUFFER_ID // A process-wide unique ID for an allocated memory region
	IsManagedAttr PointerAttribute = C.CU_POINTER_ATTRIBUTE_IS_MANAGED // Indicates if the pointer points to managed memory
)
type SharedConfig ¶
type SharedConfig byte
SharedConfig represents flags for shared memory configurations
const (
	DefaultBankSize SharedConfig = C.CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE // set default shared memory bank size
	FourByteBankSize SharedConfig = C.CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE // set shared memory bank width to four bytes
	EightByteBankSize SharedConfig = C.CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE // set shared memory bank width to eight bytes
)
func SharedMemConfig ¶
func SharedMemConfig() (pConfig SharedConfig, err error)
type Stream ¶
type Stream struct {
// contains filtered or unexported fields
}
Stream represents a CUDA stream.
func MakeStream ¶
func MakeStream(flags StreamFlags) (Stream, error)
MakeStream creates a stream. The flags determine the behavior of the stream.
func MakeStreamWithPriority ¶
func MakeStreamWithPriority(priority int, flags StreamFlags) (Stream, error)
MakeStreamWithPriority creates a stream with the given priority. The flags determine the behavior of the stream. This API alters the scheduler priority of work in the stream. Work in a higher-priority stream may preempt work already executing in a lower-priority stream.
`priority` follows a convention where lower numbers represent higher priorities. '0' represents default priority.
The range of meaningful numerical priorities can be queried using `StreamPriorityRange`. If the specified priority is outside the numerical range returned by `StreamPriorityRange`, it will automatically be clamped to the lowest or the highest number in the range.
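A small sketch of creating a non-blocking stream at an elevated priority. The priority-range query is not reproduced in this excerpt, so -1 is used as a placeholder value; per the note above, out-of-range values are clamped.

package cu

// highPriorityStream creates a non-blocking stream at an elevated priority.
// Lower numbers mean higher priority; -1 will be clamped into the device's
// valid range if it falls outside it.
func highPriorityStream() (Stream, error) {
	return MakeStreamWithPriority(-1, NonBlocking)
}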
func (Stream) AttachMemAsync ¶
func (*Stream) Destroy ¶ added in v0.9.1
Destroy destroys the stream specified by hStream.
If the device is still doing work in the stream hStream when Destroy() is called, the function will return immediately and the resources associated with hStream will be released automatically once the device has completed all work in hStream.
func (Stream) Flags ¶
func (hStream Stream) Flags() (flags StreamFlags, err error)
func (Stream) Synchronize ¶
func (Stream) WaitOnValue32 ¶
type StreamFlags ¶
type StreamFlags byte
StreamFlags are flags for stream behaviours
const (
	DefaultStream StreamFlags = C.CU_STREAM_DEFAULT // Default stream flag
	NonBlocking StreamFlags = C.CU_STREAM_NON_BLOCKING // Stream does not synchronize with stream 0 (the NULL stream)
)
type TexRef ¶
type TexRef struct {
// contains filtered or unexported fields
}
func (TexRef) AddressMode ¶
func (hTexRef TexRef) AddressMode(dim int) (pam AddressMode, err error)
func (TexRef) BorderColor ¶
func (TexRef) FilterMode ¶
func (hTexRef TexRef) FilterMode() (pfm FilterMode, err error)
func (TexRef) Flags ¶
func (hTexRef TexRef) Flags() (pFlags TexRefFlags, err error)
func (TexRef) MaxAnisotropy ¶
func (TexRef) SetAddress ¶
func (TexRef) SetAddress2D ¶
func (TexRef) SetAddressMode ¶
func (hTexRef TexRef) SetAddressMode(dim int, am AddressMode) (err error)
func (TexRef) SetBorderColor ¶
func (TexRef) SetFilterMode ¶
func (hTexRef TexRef) SetFilterMode(fm FilterMode) (err error)
func (TexRef) SetFlags ¶
func (hTexRef TexRef) SetFlags(Flags TexRefFlags) (err error)
func (TexRef) SetMaxAnisotropy ¶
func (TexRef) SetMipmapFilterMode ¶
func (hTexRef TexRef) SetMipmapFilterMode(fm FilterMode) (err error)
func (TexRef) SetMipmapLevelBias ¶
type TexRefFlags ¶
type TexRefFlags byte
const (
	ReadAsInteger TexRefFlags = C.CU_TRSF_READ_AS_INTEGER // Override the texref format with a format inferred from the array.
	NormalizeCoordinates TexRefFlags = C.CU_TRSF_NORMALIZED_COORDINATES // Use normalized texture coordinates in the range [0,1) instead of [0,dim).
	SRGB TexRefFlags = C.CU_TRSF_SRGB // Perform sRGB->linear conversion during texture read.
)
Source Files ¶
- addressing.go
- api.go
- array.go
- attributes.go
- batch.go
- batchedPatterns.go
- cgoflags.go
- context.go
- convenience.go
- ctx.go
- ctx_api.go
- cu.go
- cucontext.go
- device.go
- errors.go
- event.go
- execution.go
- fake.go
- flags.go
- graph.go
- hostfunction.go
- jit.go
- memory.go
- module.go
- occupancy.go
- params.go
- release.go
- result.go
- stream.go
- surfref.go
- texref.go
Directories ¶
Path | Synopsis
---|---
cmd |
cmd/cudatest | cudatest tests the existence of CUDA by running a simple Go program that uses CUDA.
cmd/gencublas | generate_blas creates a blas.go file from the provided C header file with optionally added documentation from the documentation package.